Example #1
import pandas as pd

import utils  # project-local helper module; importLabkey is assumed defined in this module

def pivotTable(covstats_files, out_dir):
    '''make pivot-table display from bbmap genotyping results'''

    # expect list of covstats.txt files from mapReads

    # make concatenated dataframe
    df = pd.concat((pd.read_csv(f, sep='\t', dtype={'Avg_fold': 'float'})
                    for f in covstats_files))

    # Avg_fold was already read as float via the dtype above; round to one decimal place
    df.Avg_fold = df.Avg_fold.round(1)

    # rename columns in dataframe
    df.rename(index=str,
              columns={
                  '#ID': 'allele',
                  'Avg_fold': 'read_count'
              },
              inplace=True)

    # pivot data
    pivoted = pd.pivot_table(df,
                             values='read_count',
                             index=['allele'],
                             columns='sample_name').reset_index()

    # create output genotyping folder if it doesn't exist
    genotyping_dir = utils.createOutputFolder(out_dir + '/genotyping/')

    # create CSV output files
    df.to_csv(genotyping_dir + '/genotyping_list.tsv', sep='\t')
    pivoted.to_csv(genotyping_dir + '/genotyping_pivot.tsv', sep='\t')

    # import data to labkey
    importLabkey(df)
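To make the pivot step concrete, here is a self-contained toy sketch of what pd.pivot_table does with the renamed covstats columns; the allele and sample names are made up:

import pandas as pd

df = pd.DataFrame({
    'allele': ['Mamu-A1*001', 'Mamu-A1*001', 'Mamu-B*008'],
    'sample_name': ['s1', 's2', 's1'],
    'read_count': [12.0, 3.0, 7.5],
})

# one row per allele, one read_count column per sample (NaN where not mapped)
pivoted = pd.pivot_table(df,
                         values='read_count',
                         index=['allele'],
                         columns='sample_name').reset_index()
print(pivoted)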
Example #2
import os
import subprocess
from shutil import copyfile

import status  # project-local logging helper
import utils   # project-local helper module

def mapReads(in_fastq, ref_fasta, out_dir, experiment):
    '''use mapPacBio.sh from bbmap to identify reference sequences matched by one or more PacBio reads with no substitutions (indels allowed)'''

    # path to mapPacBio.sh (the first expression resolves the folder containing this script)
    bbmap_pacbio = (os.path.dirname(
        os.path.realpath(__file__))) + '/bbmap_37_28/mapPacBio.sh'

    # get sample name from input file
    # need to strip off .gz and .fastq extensions sequentially

    sample_name = os.path.splitext(
        os.path.splitext(os.path.basename(in_fastq))[0])[0]
    print('Sample name: ' + sample_name)

    # create output genotyping folder if it doesn't exist
    sample_dir = utils.createOutputFolder(out_dir + '/genotyping/' +
                                          sample_name)

    # create bbmap command
    cmd = [
        bbmap_pacbio, 'in=' + in_fastq, 'ref=' + ref_fasta,
        'covstats=' + sample_dir + '/' + sample_name + '.covstats.tmp.txt',
        'outm=' + sample_dir + '/' + sample_name + '.mapped.bam',
        'outu=' + sample_dir + '/' + sample_name + '.unmapped.fastq.gz',
        'statsfile=' + sample_dir + '/' + sample_name + '.mapping_stats.txt',
        'subfilter=0', 'nzo=t', 'ambiguous=all', 'maxlen=1500', 'minid=0.9',
        'maxindel=10', 'minratio=0.8', 'twocolumn=t', 'ow=t'
    ]

    # print bbmap command
    status.printStatus(' '.join(cmd))

    # call bbmap; the mapping stats are saved to file, so stderr could be
    # redirected to os.devnull if the console output is too noisy
    subprocess.call(cmd)

    # add descriptors to covstats output
    with open(sample_dir + '/' + sample_name + '.covstats.tmp.txt', 'r') as f:
        with open(sample_dir + '/' + sample_name + '.covstats.txt', 'w') as g:
            for idx, line in enumerate(f):
                # print header in first line, otherwise value of sample_name
                if idx == 0:
                    g.write('sample_name' + '\t' + line.rstrip('\n') + '\t' +
                            'ref_fasta\tanalysis_path\texperiment\n')
                else:
                    g.write(sample_name + '\t' + line.rstrip('\n') + '\t' +
                            ref_fasta + '\t' + out_dir + '\t' + experiment +
                            '\n')

    # remove temporary covstats.tmp.txt file after covstats.txt with sample ID prepared
    if os.path.exists(sample_dir + '/' + sample_name + '.covstats.tmp.txt'):
        os.remove(sample_dir + '/' + sample_name + '.covstats.tmp.txt')

    # copy reference file to output folder
    copyfile(ref_fasta, out_dir + '/genotyping/' + os.path.basename(ref_fasta))

    # return covstats file
    return sample_dir + '/' + sample_name + '.covstats.txt'
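mapReads returns the per-sample covstats path, so a thin driver can collect one result per demultiplexed FASTQ and hand the list to pivotTable (Example #1). A minimal sketch, assuming hypothetical paths and a .fastq.gz glob pattern:

import glob

covstats_files = []
for fq in sorted(glob.glob('/data/run1/fastq/*.fastq.gz')):
    # each call returns the path to the per-sample covstats.txt
    covstats_files.append(
        mapReads(fq, '/data/refs/mhc_class_I.fasta', '/data/run1', '19364'))

pivotTable(covstats_files, '/data/run1')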
Example #3

import os
import subprocess

import status  # project-local logging helper
import utils   # project-local helper module

def makeCcs(subreads,
            out_dir,
            minPredictedAccuracy='0.9',
            minLength='1000',
            maxLength='1500'):
    '''use smrtlink ccs to produce consensus sequence'''

    # path to smrtlink ccs
    smrtlink_ccs_path = '/slipstream/SMRT4/SMRT/smrtcmds/bin/ccs'

    # check that subreads file exists
    if not os.path.exists(subreads):
        status.printStatus(
            'Error: Specified subread file does not exist. Check your file path and try again.'
        )
        return

    # filename of input file
    subreads_basename = os.path.splitext(os.path.basename(subreads))[0]
    print(subreads_basename)

    # create output directory if it doesn't exist
    utils.createOutputFolder(out_dir)

    # call ccs
    cmd = [
        smrtlink_ccs_path, '--minPredictedAccuracy', minPredictedAccuracy,
        '--minLength', minLength, '--maxLength', maxLength, subreads,
        out_dir + '/' + subreads_basename + '.ccs.bam'
    ]

    status.printStatus('CCS command: ' + ' '.join(cmd))
    status.printStatus('CCS processing of ' + subreads + ' started')
    subprocess.call(cmd)
    status.printStatus('CCS processing of ' + subreads + ' completed')
    status.printStatus('Output CCS file saved to ' + out_dir + '/' +
                       subreads_basename + '.ccs.bam')

    # create fastq file
    fastq_path = makeFastq(out_dir + '/' + subreads_basename + '.ccs.bam')
    return fastq_path
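A hedged usage sketch for makeCcs; the subreads path is hypothetical, and makeFastq is a helper from the same module that is not shown in these examples:

ccs_fastq = makeCcs('/data/run1/m54123_180101_000000.subreads.bam',
                    '/data/run1/ccs',
                    minPredictedAccuracy='0.9',
                    minLength='1000',
                    maxLength='1500')
# ccs_fastq can then be demultiplexed with parseBarcodes (Example #6)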
Example #4
import os
import subprocess

import status  # project-local logging helper
import utils   # project-local helper module

def runLongAmpliconAnalysis(subreadsetXML,
                            whitelistSequences,
                            outputPrefix,
                            minLength='1000',
                            maxLength='1500',
                            maxReads='20000',
                            maxClusteringReads='5000'):
    '''run SMRT Link v6 long amplicon analysis'''

    # runs LAA to generate amplicon sequences from PacBio Sequel data
    # subreadsetXML can be from a single dataset, or merged datasets where new XML files are created using dataset create
    # whitelistSequences is a file containing sequences that will be analyzed by LAA, typically sequences from a single sample
    # defaults are set for typical MHC class I genotyping and should be adjusted depending on target
    # note: LAA default minLength=3000 will cause most of our analyses to fail so minLength should almost always be set
    # increasing maxClusteringReads will allow more alleles to be detected at the expense of speed:
    # LAA default of 500 clustering reads runs each sample in ~2 minutes, MHC class I default of 10000 takes ~30 minutes
    # but detects more alleles. Setting even higher values like 100,000 clustering reads causes runtimes of several hours.
    # maxReads can be set very high to ensure that all reads are used to accurately define clusters. This doesn't significantly
    # impact runtime.

    # use outputPrefix to specify the folder and prefix for output files
    # e.g. '/slipstream/shared_data/19364/09/'
    # e.g. '/slipstream/shared_data/19364/09/BM115.'

    # path to SMRT Link v6.0 LAA
    laa_path = '/slipstream/oc/pacbio/smrtlink_v6/smrtcmds/bin/laa'

    # create output folder if it doesn't exist
    utils.createOutputFolder(os.path.dirname(outputPrefix))

    # create laa command
    # (the length and read-count flags come from the function arguments; as noted
    # above, the LAA defaults are unsuitable for these amplicons)
    laa_cmd = [
        laa_path,
        '--minLength=' + minLength,
        '--maxLength=' + maxLength,
        '--maxReads=' + maxReads,
        '--maxClusteringReads=' + maxClusteringReads,
        '--whitelist=' + whitelistSequences,
        '--logFile=' + outputPrefix + '.log.txt',
        '--resultFile=' + outputPrefix + '.amplicon_analysis.fastq',
        '--junkFile=' + outputPrefix + '.amplicon_analysis_chimeras_noise.fastq',
        '--reportFile=' + outputPrefix + '.amplicon_analysis_summary.csv',
        '--inputReportFile=' + outputPrefix + '.amplicon_analysis_input.csv',
        '--subreadsReportPrefix=' + outputPrefix + '.amplicon_analysis_subreads',
        subreadsetXML
    ]

    # print laa command
    status.printStatus(' '.join(laa_cmd))

    # call laa
    subprocess.call(laa_cmd)

    # return path to LAA fastq output
    return outputPrefix + '.amplicon_analysis.fastq'
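A usage sketch with hypothetical paths, following the outputPrefix convention from the comments above; the whitelist FASTA would typically hold the reads assigned to a single sample:

laa_fastq = runLongAmpliconAnalysis(
    subreadsetXML='/data/run1/merged.subreadset.xml',
    whitelistSequences='/data/run1/whitelists/BM115.fasta',
    outputPrefix='/slipstream/shared_data/19364/09/BM115',
    minLength='1000',
    maxLength='1500')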
Example #5
import logging
import sys

import status  # project-local logging helper
import utils   # project-local helper module

if __name__ == '__main__':  # if run directly from the command line
    # command line parameters
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("out_dir",
                        help='Folder that will store all output files')
    parser.add_argument(
        "fastq_folder",
        help='Path to folder containing FASTQ files to genotype')
    parser.add_argument(
        "ref_fasta", help='Path to reference FASTA file to map reads against')
    parser.add_argument("experiment", help='Experiment number')
    args = parser.parse_args()

    # make output folder if it doesn't exist
    utils.createOutputFolder(args.out_dir)

    # configure logging to a file in the output folder
    logging.basicConfig(filename=args.out_dir + '/log.txt',
                        filemode='w',
                        level=logging.DEBUG,
                        format='%(asctime)s %(message)s',
                        datefmt='%Y-%m-%d %I:%M:%S')

    # log command line
    status.printStatus('Command line statement: ' + ' '.join(sys.argv))

    # map reads and summarize results
    mapReadsFolder(args.fastq_folder, args.ref_fasta, args.out_dir,
                   args.experiment)
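A hypothetical shell invocation (the script filename genotyping.py is an assumption; the positional arguments are out_dir, fastq_folder, ref_fasta, experiment):

python genotyping.py /data/run1 /data/run1/fastq /data/refs/mhc_class_I.fasta 19364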
Example #6
import gzip
import os
import re
import shutil

from Bio import SeqIO
from Bio.Seq import Seq

import status  # project-local logging helper
import utils   # project-local helper module

def parseBarcodes(samples, input_ccs_fastq, out_dir):
    '''parse barcodes from gzip-compressed FASTQ of PacBio CCS reads'''

    # create output directory if it doesn't exist
    utils.createOutputFolder(out_dir)

    # create PacBio barcode dictionary to lookup against
    pacbioLookup = pacbioBarcodeDict()

    # create dictionary of sample IDs and barcode sequences
    searchDict = {}
    for seq_name, barcode_seqs in samples.items():
        searchDict[seq_name] = [pacbioLookup[barcode_seqs[0]], pacbioLookup[barcode_seqs[1]]]

    # open gzip-compressed FASTQ
    with gzip.open(input_ccs_fastq, "rt") as handle:

        # make dictionary to hold barcode-split seq records
        perBarcodeDict = {}

        # initialize dictionary with names of each sample
        for j in searchDict:
            perBarcodeDict[j] = []

        # log every 1000 sequences processed
        log_every_n = 1000

        # iterate through generator containing FASTQ sequences
        for idx, i in enumerate(SeqIO.parse(handle, "fastq")):

            # print status message every 1000 sequences processed
            if (idx % log_every_n) == 0:
                status.printStatus(str(idx) + ' FASTQ reads demultiplexed')

            # for each sequence, look for the presence of barcodes near the start and end
            for j in searchDict:

                # regular expression to find barcodes in forward orientation;
                # re.search also matches barcodes that are not at the very ends of the sequence
                prog = re.compile(searchDict[j][0] + '.*' + searchDict[j][1])

                # test if regular expression is found in sequence
                # need to cast i.seq to string to use re.search
                if prog.search(str(i.seq)):
                    # store matching read in perBarcodeDict (kept in memory)
                    perBarcodeDict[j].append(i)

                # handle inserts in the opposite orientation:
                # reverse complement the barcode sequences with Biopython
                forward_seq_rc = Seq(searchDict[j][0]).reverse_complement()
                reverse_seq_rc = Seq(searchDict[j][1]).reverse_complement()

                # because of the SMRTBell orientation, the second barcode comes first
                # in the reverse complement orientation
                prog = re.compile(str(reverse_seq_rc) + '.*' + str(forward_seq_rc))

                # need to cast i.seq to string to use re.search
                if prog.search(str(i.seq)):
                    # store matching read in perBarcodeDict
                    perBarcodeDict[j].append(i)

        # write output files containing reads matching each barcode
        for i in perBarcodeDict:
            count = SeqIO.write(perBarcodeDict[i], out_dir + '/' + i + '.fastq', 'fastq')

            # compress fastq file and remove uncompressed version

            with open(out_dir + '/' + i + '.fastq', 'rb') as f_in:
                with gzip.open(out_dir + '/' + i + '.fastq.gz', 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

            os.remove(out_dir + '/' + i + '.fastq') # remove uncompressed

            # log
            status.printStatus(str(count) + ' barcoded reads saved from sample ' + i)
            status.printStatus('gzip-compressed demultiplexed FASTQ file saved to ' +
                               out_dir + '/' + i + '.fastq.gz')
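A usage sketch; the sample IDs and PacBio barcode names are made up, and pacbioBarcodeDict (used above to translate barcode names to sequences) comes from the same module:

samples = {
    'BM115': ['bc1001', 'bc1002'],  # hypothetical [forward, reverse] barcode names
    'BM116': ['bc1003', 'bc1004'],
}
parseBarcodes(samples, '/data/run1/ccs/movie.ccs.fastq.gz', '/data/run1/demultiplexed')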