import gzip
import logging
import os
import re
import shutil
import subprocess
import sys
from shutil import copyfile

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

# project-local helper modules; importLabkey, makeFastq, pacbioBarcodeDict,
# and mapReadsFolder are defined elsewhere in this project
import status
import utils


def pivotTable(covstats_files, out_dir):
    '''make pivot-table display from bbmap genotyping results'''
    # expects list of covstats.txt files from mapReads

    # make concatenated dataframe
    df = pd.concat((pd.read_csv(f, sep='\t', dtype={'Avg_fold': 'float'})
                    for f in covstats_files))

    # Avg_fold is read as float (dtype above); round to one decimal
    df.Avg_fold = df.Avg_fold.round(1)

    # rename columns in dataframe
    df.rename(index=str,
              columns={
                  '#ID': 'allele',
                  'Avg_fold': 'read_count'
              },
              inplace=True)

    # pivot data
    pivoted = pd.pivot_table(df,
                             values='read_count',
                             index=['allele'],
                             columns='sample_name').reset_index()

    # create output genotyping folder if it doesn't exist
    genotyping_dir = utils.createOutputFolder(out_dir + '/genotyping/')

    # create TSV output files
    df.to_csv(genotyping_dir + '/genotyping_list.tsv', sep='\t')
    pivoted.to_csv(genotyping_dir + '/genotyping_pivot.tsv', sep='\t')

    # import data to labkey
    importLabkey(df)
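
# Illustrative sketch (not part of the pipeline): shows the long-to-wide
# reshaping that pivotTable performs, using a toy dataframe. The allele and
# sample names below are hypothetical.
def _pivotTableExample():
    '''demonstrate the pivot used by pivotTable on a toy dataframe'''
    df = pd.DataFrame({
        'allele': ['allele_A', 'allele_A', 'allele_B'],
        'sample_name': ['sample1', 'sample2', 'sample1'],
        'read_count': [12.0, 7.0, 3.0]
    })
    # one row per allele and one column per sample; NaN where a sample
    # had no reads mapping to that allele
    pivoted = pd.pivot_table(df,
                             values='read_count',
                             index=['allele'],
                             columns='sample_name').reset_index()
    print(pivoted)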
def mapReads(in_fastq, ref_fasta, out_dir, experiment):
    '''use mapPacBio.sh from bbmap to identify reference sequences matched
    by one or more PacBio reads with no substitutions (indels allowed)'''

    # mapPacBio path (first part gets path to the folder containing this script)
    bbmap_pacbio = (os.path.dirname(
        os.path.realpath(__file__))) + '/bbmap_37_28/mapPacBio.sh'

    # get sample name from input file
    # need to strip off .gz and .fastq extensions sequentially
    sample_name = os.path.splitext(
        os.path.splitext(os.path.basename(in_fastq))[0])[0]
    print('Sample name: ' + sample_name)

    # create output genotyping folder if it doesn't exist
    sample_dir = utils.createOutputFolder(out_dir + '/genotyping/' +
                                          sample_name)

    # create bbmap command
    cmd = [
        bbmap_pacbio, 'in=' + in_fastq, 'ref=' + ref_fasta,
        'covstats=' + sample_dir + '/' + sample_name + '.covstats.tmp.txt',
        'outm=' + sample_dir + '/' + sample_name + '.mapped.bam',
        'outu=' + sample_dir + '/' + sample_name + '.unmapped.fastq.gz',
        'statsfile=' + sample_dir + '/' + sample_name + '.mapping_stats.txt',
        'subfilter=0', 'nzo=t', 'ambiguous=all', 'maxlen=1500', 'minid=0.9',
        'maxindel=10', 'minratio=0.8', 'twocolumn=t', 'ow=t'
    ]

    # print bbmap command
    status.printStatus(' '.join(cmd))

    # call bbmap (mapping stats are saved to file, so stderr is not suppressed)
    subprocess.call(cmd)

    # add descriptors to covstats output
    with open(sample_dir + '/' + sample_name + '.covstats.tmp.txt', 'r') as f:
        with open(sample_dir + '/' + sample_name + '.covstats.txt', 'w') as g:
            for idx, line in enumerate(f):
                # write header in first line, otherwise value of sample_name
                if idx == 0:
                    g.write('sample_name' + '\t' + line.rstrip('\n') + '\t' +
                            'ref_fasta\tanalysis_path\texperiment\n')
                else:
                    g.write(sample_name + '\t' + line.rstrip('\n') + '\t' +
                            ref_fasta + '\t' + out_dir + '\t' + experiment +
                            '\n')

    # remove temporary covstats.tmp.txt file once the annotated covstats.txt is prepared
    if os.path.exists(sample_dir + '/' + sample_name + '.covstats.tmp.txt'):
        os.remove(sample_dir + '/' + sample_name + '.covstats.tmp.txt')

    # copy reference file to output folder
    copyfile(ref_fasta,
             out_dir + '/genotyping/' + os.path.basename(ref_fasta))

    # return covstats file
    return sample_dir + '/' + sample_name + '.covstats.txt'
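
# Hedged sketch of a driver like the mapReadsFolder called from __main__
# below (the real implementation lives elsewhere in this project): it is
# assumed to map every gzipped FASTQ in a folder with mapReads and then
# summarize the resulting covstats files with pivotTable.
def _mapReadsFolderSketch(fastq_folder, ref_fasta, out_dir, experiment):
    '''map each FASTQ in fastq_folder and pivot the combined results'''
    import glob
    covstats_files = []
    for fastq in sorted(glob.glob(fastq_folder + '/*.fastq.gz')):
        covstats_files.append(mapReads(fastq, ref_fasta, out_dir, experiment))
    pivotTable(covstats_files, out_dir)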
def makeCcs(subreads,
            out_dir,
            minPredictedAccuracy='0.9',
            minLength='1000',
            maxLength='1500'):
    '''use smrtlink ccs to produce circular consensus sequences'''

    # path to smrtlink ccs
    smrtlink_ccs_path = '/slipstream/SMRT4/SMRT/smrtcmds/bin/ccs'

    # check that subreads file exists
    if not os.path.exists(subreads):
        status.printStatus(
            'Error: Specified subread file does not exist. Check your file path and try again.'
        )
        return

    # filename of input file
    subreads_basename = os.path.splitext(os.path.basename(subreads))[0]
    print(subreads_basename)

    # create output directory if it doesn't exist
    utils.createOutputFolder(out_dir)

    # call ccs
    cmd = [
        smrtlink_ccs_path, '--minPredictedAccuracy', minPredictedAccuracy,
        '--minLength', minLength, '--maxLength', maxLength, subreads,
        out_dir + '/' + subreads_basename + '.ccs.bam'
    ]
    status.printStatus('CCS command: ' + ' '.join(cmd))
    status.printStatus('CCS processing of ' + subreads + ' started')
    subprocess.call(cmd)
    status.printStatus('CCS processing of ' + subreads + ' completed')
    status.printStatus('Output CCS file saved to ' + out_dir + '/' +
                       subreads_basename + '.ccs.bam')

    # create fastq file from the CCS BAM
    fastq_path = makeFastq(out_dir + '/' + subreads_basename + '.ccs.bam')
    return fastq_path
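
# Hedged sketch of the makeFastq helper used above. The real implementation
# is defined elsewhere in this project and may instead use SMRT Link's
# bam2fastq; this version assumes samtools is available on the PATH.
def _makeFastqSketch(ccs_bam):
    '''convert a CCS BAM to gzip-compressed FASTQ; return the FASTQ path'''
    fastq_path = os.path.splitext(ccs_bam)[0] + '.fastq.gz'
    # samtools fastq writes FASTQ records to stdout; compress on the fly
    p = subprocess.Popen(['samtools', 'fastq', ccs_bam],
                         stdout=subprocess.PIPE)
    with gzip.open(fastq_path, 'wb') as gz:
        shutil.copyfileobj(p.stdout, gz)
    p.wait()
    return fastq_path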
def runLongAmpliconAnalysis(subreadsetXML,
                            whitelistSequences,
                            outputPrefix,
                            minLength='1000',
                            maxLength='1500',
                            maxReads='20000',
                            maxClusteringReads='5000'):
    '''run SMRT Link long amplicon analysis (LAA)'''
    # runs LAA to generate amplicon sequences from PacBio Sequel data
    # subreadsetXML can be from a single dataset, or merged datasets where new
    # XML files are created using 'dataset create'
    # whitelistSequences is a file containing sequences that will be analyzed
    # by LAA, typically sequences from a single sample
    # defaults are set for typical MHC class I genotyping and should be
    # adjusted depending on target
    # note: the LAA default minLength=3000 will cause most of our analyses to
    # fail, so minLength should almost always be set
    # increasing maxClusteringReads will allow more alleles to be detected at
    # the expense of speed: the LAA default of 500 clustering reads runs each
    # sample in ~2 minutes; the MHC class I default of 10000 takes ~30 minutes
    # but detects more alleles. Setting even higher values, like 100,000
    # clustering reads, causes runtimes of several hours.
    # maxReads can be set very high to ensure that all reads are used to
    # accurately define clusters; this doesn't significantly impact runtime.
    # use outputPrefix to specify the folder and prefix for output files
    # e.g., '/slipstream/shared_data/19364/09/'
    # e.g., '/slipstream/shared_data/19364/09/BM115.'

    # path to SMRT Link v6.0 LAA
    laa_path = '/slipstream/oc/pacbio/smrtlink_v6/smrtcmds/bin/laa'

    # create output folder if it doesn't exist
    utils.createOutputFolder(os.path.dirname(outputPrefix))

    # create laa command
    laa_cmd = [
        laa_path,
        '--minLength=' + minLength,
        '--maxLength=' + maxLength,
        '--maxReads=' + maxReads,
        '--maxClusteringReads=' + maxClusteringReads,
        '--whitelist=' + whitelistSequences,
        '--logFile=' + outputPrefix + '.log.txt',
        '--resultFile=' + outputPrefix + '.amplicon_analysis.fastq',
        '--junkFile=' + outputPrefix + '.amplicon_analysis_chimeras_noise.fastq',
        '--reportFile=' + outputPrefix + '.amplicon_analysis_summary.csv',
        '--inputReportFile=' + outputPrefix + '.amplicon_analysis_input.csv',
        '--subreadsReportPrefix=' + outputPrefix + '.amplicon_analysis_subreads',
        subreadsetXML
    ]

    # print laa command
    status.printStatus(' '.join(laa_cmd))

    # call laa
    subprocess.call(laa_cmd)

    # return path to LAA fastq output (matches resultFile above)
    return outputPrefix + '.amplicon_analysis.fastq'
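
# Usage sketch for runLongAmpliconAnalysis; the paths and filenames below are
# hypothetical. Merging multiple subreadsets first with SMRT Link's
# 'dataset create' is one assumed workflow, per the notes above:
#
#   dataset create --type SubreadSet merged.subreadset.xml \
#       movie1.subreadset.xml movie2.subreadset.xml
#
def _runLaaExample():
    '''call LAA on a hypothetical merged subreadset with a per-sample whitelist'''
    laa_fastq = runLongAmpliconAnalysis(
        subreadsetXML='/slipstream/shared_data/19364/merged.subreadset.xml',
        whitelistSequences='/slipstream/shared_data/19364/09/BM115.whitelist.fasta',
        outputPrefix='/slipstream/shared_data/19364/09/BM115')
    return laa_fastq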
if __name__ == '__main__':
    # run directly from the command line

    # command line parameters
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("out_dir",
                        help='Folder that will store all output files')
    parser.add_argument(
        "fastq_folder",
        help='Path to folder containing FASTQ files to genotype')
    parser.add_argument(
        "ref_fasta",
        help='Path to reference FASTA file to map reads against')
    parser.add_argument("experiment", help='Experiment number')
    args = parser.parse_args()

    # make output folder if it doesn't exist
    utils.createOutputFolder(args.out_dir)

    # configure logging to a log.txt file in the output folder
    logging.basicConfig(filename=args.out_dir + '/log.txt',
                        filemode='w',
                        level=logging.DEBUG,
                        format='%(asctime)s %(message)s',
                        datefmt='%Y-%m-%d %I:%M:%S')

    # log command line
    status.printStatus('Command line statement: ' + ' '.join(sys.argv))

    # map reads and summarize results
    mapReadsFolder(args.fastq_folder, args.ref_fasta, args.out_dir,
                   args.experiment)
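
# Example command-line invocation (hypothetical script name and paths):
#
#   python genotyping.py /out/run01 /data/run01/fastq ref/alleles.fasta 19364
#
# /out/run01 receives log.txt and the genotyping/ output folder.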
def parseBarcodes(samples, input_ccs_fastq, out_dir):
    '''parse barcodes from gzip-compressed FASTQ of PacBio CCS reads'''

    # create output directory if it doesn't exist
    utils.createOutputFolder(out_dir)

    # create PacBio barcode dictionary to look up against
    pacbioLookup = pacbioBarcodeDict()

    # create dictionary of sample IDs and barcode sequences
    searchDict = {}
    for seq_name, barcode_seqs in samples.items():
        searchDict[seq_name] = [
            pacbioLookup[barcode_seqs[0]], pacbioLookup[barcode_seqs[1]]
        ]

    # open gzip-compressed FASTQ
    with gzip.open(input_ccs_fastq, "rt") as handle:
        # make dictionary to hold barcode-split seq records,
        # initialized with the name of each sample
        perBarcodeDict = {}
        for j in searchDict:
            perBarcodeDict[j] = []

        # log every 1000 sequences processed
        log_every_n = 1000

        # iterate through generator containing FASTQ sequences
        for idx, i in enumerate(SeqIO.parse(handle, "fastq")):
            # print status message every 1000 sequences processed
            if (idx % log_every_n) == 0:
                status.printStatus(str(idx) + ' FASTQ reads demultiplexed')

            # for each sequence, look for both sample barcodes; use re.search
            # so barcodes need not sit at the very ends of the read
            for j in searchDict:
                # regular expression to find barcodes in forward orientation
                prog = re.compile(searchDict[j][0] + '.*' + searchDict[j][1])

                # need to cast i.seq to string to use re.search
                if prog.search(str(i.seq)):
                    # store matching reads in memory
                    perBarcodeDict[j].append(i)

                # handle inserts in the opposite orientation:
                # create Biopython sequence objects containing barcode sequences
                forward_seq = Seq(searchDict[j][0])
                reverse_seq = Seq(searchDict[j][1])

                # reverse complement
                forward_seq_rc = forward_seq.reverse_complement()
                reverse_seq_rc = reverse_seq.reverse_complement()

                # find FASTQ sequences matching reverse-complemented barcodes;
                # because of the SMRTbell orientation, the second barcode is
                # listed first in reverse-complement orientation
                prog = re.compile(
                    str(reverse_seq_rc) + '.*' + str(forward_seq_rc))

                # need to cast i.seq to string to use re.search
                if prog.search(str(i.seq)):
                    # store matches in dictionary
                    perBarcodeDict[j].append(i)

    # write output files containing reads matching each barcode
    for i in perBarcodeDict:
        count = SeqIO.write(perBarcodeDict[i], out_dir + '/' + i + '.fastq',
                            'fastq')

        # compress fastq file and remove uncompressed version
        with open(out_dir + '/' + i + '.fastq', 'rb') as f_in:
            with gzip.open(out_dir + '/' + i + '.fastq.gz', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.remove(out_dir + '/' + i + '.fastq')

        # log per-sample read counts and output location
        status.printStatus(
            str(count) + ' barcoded reads saved from sample ' + i)
        status.printStatus('gzip-compressed demultiplexed FASTQ file saved to '
                           + out_dir + '/' + i + '.fastq.gz')
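
# Usage sketch for parseBarcodes; the sample IDs, barcode names, and paths
# below are hypothetical. The structure of 'samples' (sample ID mapped to a
# [forward, reverse] pair of barcode names known to pacbioBarcodeDict) is
# inferred from the lookup code above.
def _parseBarcodesExample():
    '''demultiplex a CCS FASTQ for two hypothetical samples'''
    samples = {
        'BM115': ['bc1001', 'bc1002'],
        'BM116': ['bc1003', 'bc1004']
    }
    parseBarcodes(samples,
                  '/slipstream/shared_data/19364/ccs.fastq.gz',
                  '/slipstream/shared_data/19364/demultiplexed')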