Example #1
def runAll(args):
  
	print('\n\n\nYou have requested to count unique sam files')
	print('\tWARNING:')
	print('\t\tIF USING ANY REFERENCES OTHER THAN THOSE I PROVIDE I CANNOT GUARANTEE RESULT ACCURACY')
	print('\n')
	#set up environment#
	args.SamDirectory = common.fixDirName(args.SamDirectory)
	
	countDir = os.path.dirname(args.SamDirectory[:-1]) + '/BinCounts/'
	if args.output:
		countDir = common.fixDirName(args.output)
	
	statsDir = os.path.dirname(args.SamDirectory[:-1]) + '/PipelineStats/'
	if args.statdir:
		statsDir = common.fixDirName(args.statdir)
			
	for i in [countDir, statsDir]:
		common.makeDir(i)

	samFiles = common.getSampleList(args.SamDirectory, args.samples, 'sam')
	#run multiprocessing of all bin counting commands#	
	argList = [(x, countDir, statsDir, args.species) for x in samFiles]
	common.daemon(countfile.runOne, argList, 'count sam files')
	print('\nBin counts complete\n\n\n')
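All of these examples lean on a project-local `common` module whose source is not shown on this page. Below is a minimal sketch of those helpers, inferred purely from the call sites; the names and argument order come from the examples, while every body is an assumption and may differ from the real pipeline:

import os
import multiprocessing

def fixDirName(dirName):
	#normalize a directory path so it ends in exactly one '/'#
	return dirName.rstrip('/') + '/'

def makeDir(dirName):
	#create the directory (and any parents) if it does not already exist#
	if not os.path.isdir(dirName):
		os.makedirs(dirName)

def getSampleList(directory, samples, fileType):
	#list files whose names contain the file-type tag, optionally limited to the requested samples#
	files = [directory + x for x in os.listdir(directory) if '.' + fileType in x]
	if samples:
		files = [x for x in files if any(os.path.basename(x).startswith(s) for s in samples)]
	return sorted(files)

def daemon(func, argList, description, cpuPerProcess=1):
	#fan the per-sample argument tuples out over a process pool;
	#each tuple is unpacked positionally into func, and results come back in order#
	print('Starting to ' + description)
	poolSize = max(1, multiprocessing.cpu_count() // cpuPerProcess)
	with multiprocessing.Pool(poolSize) as pool:
		return pool.starmap(func, argList)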
Example #2
def runAll(args):
	
	print('\n\n\nYou have requested to preprocess (trim) fastq files')
	print('\tWARNING:')
	print('\t\tIF USING ANY LENGTH OTHER THAN 36 BP, REFERENCE FILES ARE NOT SUPPORTED FOR DOWNSTREAM PROCESSING')
	print('\n')
	#make sure environment is properly prepared#
	args.FastqDirectory = common.fixDirName(args.FastqDirectory)
	if not args.remove:
		common.makeDir(args.FastqDirectory + 'FullLength/')

	#get list of fastq files to process (depending on args.samples)
	fastqFiles = common.getSampleList(args.FastqDirectory, args.samples, 'fastq')
	#use the daemon and the preprocessing code to trim all fastq files in parallel

	if args.remove:
		#pass the remove flag positionally; keyword syntax is not valid inside a tuple literal
		argList = [(x, args.trim5, args.length, True) for x in fastqFiles]
	else:
		argList = [(x, args.trim5, args.length) for x in fastqFiles]
		
	common.daemon(trimfile.preprocessOne, argList, 'trim sequencing reads to desired length', cpuPerProcess=1)
	print('\nPre-processing complete\n\n\n')
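For those argument tuples to unpack cleanly, the worker needs a matching positional signature. A hypothetical skeleton consistent with both call shapes above (the function name comes from the example; the parameter names and body are placeholders, not the pipeline's actual trimming code):

def preprocessOne(fastqFile, trim5, length, remove=False):
	#clip `trim5` bases from the 5' end and keep the next `length` bases of each read;
	#when `remove` is True, discard the full-length originals instead of
	#archiving them under FullLength/
	pass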
Example #3
def runAll(args):
	
	print('\n\n\nYou have requested to map fastq files')
	print('\tWARNING:')
	print('\t\tIF USING ANY REFERENCES OTHER THAN THOSE I PROVIDE I CANNOT GUARANTEE RESULT ACCURACY')
	print('\n')
	#set up environment#
	args.FastqDirectory = common.fixDirName(args.FastqDirectory)
	
	samDir = os.path.dirname(args.FastqDirectory[:-1]) + '/Sam/'
	if args.output:
		samDir = common.fixDirName(args.output)
	
	statsDir = os.path.dirname(args.FastqDirectory[:-1]) + '/PipelineStats/'
	if args.statdir:
		statsDir = common.fixDirName(args.statdir)

	tempDir = args.FastqDirectory + 'Temp/'

	for i in [samDir, tempDir, statsDir]:
		common.makeDir(i)		

	fastqFiles = common.getSampleList(args.FastqDirectory, args.samples, 'fastq')
	#run multiprocessing of all mapping commands#
	argList = [(x, args.species, args.trim, statsDir, tempDir, samDir) for x in fastqFiles]
		
	common.daemon(mapfile.runOne, argList, 'map fastq files', cpuPerProcess=8)
	#remove all temporary files#
	shutil.rmtree(tempDir[:-1])

	print('\nMapping complete\n\n\n')
Example #4
def runAll(args):

    print('\n\n\nYou have requested to map fastq files')
    print('\tWARNING:')
    print('\t\tPLEASE MAKE SURE YOU ARE USING')
    print('\t\t\tBowtie v1 and Samtools v0.1.19')
    print('\t\t\tBowtie v1 mapping indexes for either mm10 or hg38')
    print('\n')

    #set up environment#
    args.FastqDirectory = common.fixDirName(args.FastqDirectory)

    samDir = os.path.dirname(args.FastqDirectory[:-1]) + '/Sam/'
    if args.output:
        samDir = common.fixDirName(args.output)

    statsDir = os.path.dirname(args.FastqDirectory[:-1]) + '/PipelineStats/'
    if args.statdir:
        statsDir = common.fixDirName(args.statdir)

    tempDir = args.FastqDirectory + 'Temp/'

    for i in [samDir, tempDir, statsDir]:
        common.makeDir(i)

    fastqFiles = common.getSampleList(args.FastqDirectory, args.samples,
                                      'fastq')

    #run multiprocessing of all mapping commands#
    argList = [(x, args.MapIndex, args.trim, statsDir, tempDir, samDir,
                args.bowtie, args.samtools) for x in fastqFiles]
    common.daemon(mapfile.runOne, argList, 'map fastq files', cpuPerProcess=8)

    #remove all temporary files#
    shutil.rmtree(tempDir[:-1])

    print('\nMapping complete\n\n\n')
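Example #4 threads an explicit mapping index and tool paths through the argument tuples, so its mapfile.runOne must take more parameters than the one called in Example #3. A hypothetical signature matching that tuple layout (parameter names are assumptions):

def runOne(fastqFile, mapIndex, trim, statsDir, tempDir, samDir,
           bowtiePath, samtoolsPath):
    #map one fastq file with Bowtie v1 against mapIndex, convert with the given
    #Samtools, write the .sam into samDir and the mapping stats into statsDir
    pass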
Example #5
def runAll(args):

    print('\n\n\nYou have requested to normalize and segment bincount files')
    print('\tWARNING:')
    print(
        '\t\tIF USING ANY REFERENCES OTHER THAN THOSE I PROVIDE I CANNOT GUARANTEE RESULT ACCURACY'
    )
    print('\n')

    #Set up environment#
    args.AnalysisDirectory = common.fixDirName(args.AnalysisDirectory)

    CountDir = args.AnalysisDirectory + 'BinCounts/'
    if args.bincountdir:
        CountDir = common.fixDirName(args.bincountdir)

    lowessDir = args.AnalysisDirectory + 'LowessBinCounts/'
    segmentDir = args.AnalysisDirectory + 'Segments/'
    tempDir = args.AnalysisDirectory + 'Temp/'

    common.makeDir(lowessDir)
    if not args.normalizeonly:
        common.makeDir(segmentDir)
        common.makeDir(tempDir)

    sampleFiles = common.getSampleList(CountDir, args.samples, 'bincounts')

    info = common.importInfoFile(args.infofile, args.columns, 'normalize')

    if args.infofile:
        refArray = info
    else:
        thisDtype = info
        #no info file: derive sample names by stripping the presumed 14-character '.bincounts.txt' suffix#
        refArray = np.array(
            [(os.path.basename(x)[:-14], 'unk', 1) for x in sampleFiles],
            dtype=thisDtype)

    #match each reference name to the first sample file whose basename starts with that name#
    sampleDict = {
        x: [y for y in sampleFiles if x == os.path.basename(y)[:len(x)]][0]
        for x in refArray['name']
    }

    #Run normalization for all samples#
    #start each method with a placeholder reference; every sample defaults to no method correction ('NA')#
    methodDict = {x: [False] for x in np.unique(refArray['method'])}
    methodDict['NA'] = [False]
    sampleNormMethodDict = {x: 'NA' for x in refArray['name']}

    if not args.gconly:
        for i in methodDict:
            refSlice = refArray[(refArray['method'] == i)
                                & (refArray['cells'] == 1)]
            methodSamples = [sampleDict[x] for x in refSlice['name']]

            methodDict[i] = normalizefile.runMakeMethodRef(
                args.species, methodSamples, i, lowessDir)

            if methodDict[i][0] != False:
                for j in refSlice['name']:
                    sampleNormMethodDict[j] = i

    #run multiprocessing for gc (+ method) correction
    normArgs = [
        (args.species, sampleDict[x], methodDict[sampleNormMethodDict[x]],
         lowessDir + x + '.lowess.txt') for x in sampleDict.keys()
    ]
    common.daemon(normalizefile.runNormalizeOne, normArgs,
                  'normalize bincount files')

    print('\nNormalization complete\n\n\n')

    #Run CBS for all samples#
    if not args.normalizeonly:
        segArgs = [(x, args.species, tempDir, lowessDir, segmentDir)
                   for x in refArray['name']]
        common.daemon(segmentfile.segmentOne, segArgs, 'segment bincount data')

        #the temp directory only exists when segmentation ran#
        shutil.rmtree(tempDir[:-1])

        print('\nSegmentation complete\n\n\n')
Example #6
def runAll(args):

    print('\n\n\nYou have requested to analyze CNV call data')
    print('\tWARNING:')
    print(
        '\t\tIF USING ANY REFERENCES OTHER THAN THOSE I PROVIDE I CANNOT GUARANTEE RESULT ACCURACY'
    )
    print('\n')

    #Set up environment#
    args.AnalysisDirectory = common.fixDirName(args.AnalysisDirectory)

    folderDict = {
        'LowessBinCounts': args.lowess,
        'Segments': args.segments,
        'PipelineStats': args.countstats
    }

    for i in list(folderDict.keys()):
        if not folderDict[i]:
            folderDict[i] = args.AnalysisDirectory + i + '/'
        else:
            folderDict[i] = common.fixDirName(folderDict[i])

    QCdir = args.AnalysisDirectory + 'QC/'
    CNVdir = args.AnalysisDirectory + 'CNVlists/'
    summaryDir = args.AnalysisDirectory + 'SummaryFiles/'
    PloidyPlotDir = args.AnalysisDirectory + 'PloidyDeterminationPlots/'
    CNplotDir = args.AnalysisDirectory + 'CopyNumberProfilePlots/'
    ChromPlotDir = args.AnalysisDirectory + 'ChromosomeCopyNumberPlots/'

    for i in [
            args.AnalysisDirectory, QCdir, CNVdir, summaryDir, PloidyPlotDir,
            CNplotDir, ChromPlotDir
    ]:
        common.makeDir(i)

    #get list of samples to process
    #will involve checking infofile (if present) and whether required input files exist
    sampleFiles = common.getSampleList(folderDict['Segments'], args.samples,
                                       'segments')
    sampleNames = [x.split('/')[-1].split('.')[0] for x in sampleFiles]

    #	info = common.importInfoFile(args.infofile, args.columns, 'interpret')
    #	if args.infofile:
    #		refArray = info
    #	else:
    #		thisDtype = info
    #		refArray = np.array(
    #			[ (x, 1, 'unk',) for x in sampleNames],
    #			dtype=thisDtype)

    #QC assessment#
    #	qcfile.runQCone(sampleNames[0], args.species, folderDict['PipelineStats'], folderDict['LowessBinCounts'], folderDict['Segments'], QCdir, PloidyPlotDir)
    argList = [(x, args.species, folderDict['PipelineStats'],
                folderDict['LowessBinCounts'], folderDict['Segments'], QCdir,
                PloidyPlotDir) for x in sampleNames]
    common.daemon(qcfile.runQCone, argList, 'assess sample quality')

    analysisSamples = []
    ploidyDict = {}
    genderDict = {}

    #merge the per-sample QC files into one table, keeping only samples that pass QC#
    mergeQCfile = summaryDir + 'QCmetrics.txt'
    OUT = open(mergeQCfile, 'w')
    OUT.write('Name\tReads\tMAPD\tCS\tPloidy\tGender\tPASS\n')

    for i in sampleNames:
        IN = open(QCdir + i + '.qcTEMP.txt', 'r')
        data = IN.readline()
        OUT.write(data)

        data = data.rstrip().split('\t')
        #fields follow the merged header: Name, Reads, MAPD, CS, Ploidy, Gender, PASS#
        if data[-1] == 'True':
            analysisSamples.append(i)
            ploidyDict[i] = float(data[4])
            genderDict[i] = data[-2]

        IN.close()
        os.remove(QCdir + i + '.qcTEMP.txt')

    OUT.close()
    os.rmdir(QCdir)

    #FUnC: CNV filtering#
    if args.nofilter:
        print('\nFURTHER CODE IS ONLY DEVELOPED FOR WHEN FUnC IS IMPLEMENTED, EXITING NOW\n\n\n')
        raise SystemExit

    #	funcfile.FUnCone(analysisSamples[0], args.species, folderDict['Segments'], CNVdir,
    #			 ploidyDict[analysisSamples[0]], genderDict[analysisSamples[0]])
    argList = [(x, args.species, folderDict['Segments'], CNVdir, ploidyDict[x],
                genderDict[x]) for x in analysisSamples]
    common.daemon(funcfile.FUnCone, argList, 'remove unreliable CNV calls')

    #CNV analysis#
    #	summaryStats = analyzefiles.analyzeOne(analysisSamples[0], args.species, CNVdir, folderDict['LowessBinCounts'], CNplotDir, ChromPlotDir, ploidyDict[analysisSamples[0]], genderDict[analysisSamples[0]])
    #	summaryStats = [summaryStats]
    argList = [(x, args.species, CNVdir, folderDict['LowessBinCounts'],
                CNplotDir, ChromPlotDir, ploidyDict[x], genderDict[x])
               for x in analysisSamples]
    summaryStats = common.daemon(analyzefiles.analyzeOne, argList,
                                 'create summary files')

    cellStatsFile = summaryDir + 'CellStats.txt'
    chromAmpFile = summaryDir + 'ChromosomeAmplifiedPercent.txt'
    chromDelFile = summaryDir + 'ChromosomeDeletedPercent.txt'

    #write summary statistics files#
    with open(cellStatsFile, 'w') as CELL, \
         open(chromAmpFile, 'w') as AMP, \
         open(chromDelFile, 'w') as DEL:
        CELL.write(
            'Sample\tDeletionNumber\tAmplificationNumber\tTotalCNVnumber\tDeletedMB\tAmplifiedMB\tNetDNAalteredMB\n'
        )
        chromHeader = 'Sample\t' + '\t'.join(summaryStats[0]['chroms']) + '\n'
        AMP.write(chromHeader)
        DEL.write(chromHeader)

        for i, j in enumerate(analysisSamples):
            CELL.write(j + '\t')
            cellOut = [
                summaryStats[i]['cellStats']['delCount'],
                summaryStats[i]['cellStats']['ampCount'],
                summaryStats[i]['cellStats']['delCount'] +
                summaryStats[i]['cellStats']['ampCount'],
                np.round(summaryStats[i]['cellStats']['delMB'], 3),
                np.round(summaryStats[i]['cellStats']['ampMB'], 3),
                np.round(
                    summaryStats[i]['cellStats']['ampMB'] -
                    summaryStats[i]['cellStats']['delMB'], 3)
            ]
            cellOut = '\t'.join(map(str, cellOut)) + '\n'
            CELL.write(cellOut)

            AMP.write(j + '\t')
            ampOut = [
                np.round(summaryStats[i]['chromAmp'][x], 3)
                for x in summaryStats[0]['chroms']
            ]
            ampOut = '\t'.join(map(str, ampOut)) + '\n'
            AMP.write(ampOut)

            DEL.write(j + '\t')
            delOut = [
                np.round(summaryStats[i]['chromDel'][x], 3)
                for x in summaryStats[0]['chroms']
            ]
            delOut = '\t'.join(map(str, delOut)) + '\n'
            DEL.write(delOut)

    print('\nCNV analysis complete\n\n\n')