Example #1
0
    def do_resequencing(self, data_file, amp_analysis_records, min_length,
                        whitelist_file):
        """
        Resequence the supplied Amplicon Analysis records in a single pass.

        The records are written to 'reference.fasta' inside the
        'amp_analysis_resequencing' output folder, a Bax.H5 FOFN is created
        under self.output, and both are handed to self.resequencer together
        with the caller-provided whitelist file and min_length.
        """
        log.info('Resequencing supplied Amplicon Analysis sequences')
        work_dir = self.get_output_folder("amp_analysis_resequencing")

        # Open a reader on the raw data (its result is not used further in
        # this method) and write the Bax.H5 FOFN consumed by the resequencer
        _bash5_reader = get_bash5_reader(data_file)
        fofn_path = os.path.join(self.output, 'baxh5.fofn')
        create_baxh5_fofn(data_file, fofn_path)

        # Every supplied record becomes part of the reference
        record_count = len(amp_analysis_records)
        log.info(
            'Identified {0} consensus sequences to resequence'.format(
                record_count))
        reference_path = os.path.join(work_dir, 'reference.fasta')
        write_records(amp_analysis_records, reference_path)

        # Run the resequencer on the reference with the supplied whitelist
        self.resequencer(fofn_path, whitelist_file, reference_path,
                         output=work_dir, min_length=min_length)
        log.info(
            "Finished resequencing supplied Amplicon Analysis sequences\n")
    def __call__(self,
                 amp_analysis,
                 data_file,
                 barcode_file,
                 barcode_string=None,
                 min_snr=None,
                 min_length=None):
        """
        Run the barcode-by-barcode Amplicon Analysis resequencing workflow.

        For each barcode returned by get_barcodes(), the consensus sequences
        carrying that barcode are filtered (num_reads >= 20, then
        get_unique_records), written to 'reference.fasta' in the barcode's
        output folder, and resequenced against the barcode's ZMWs that pass
        filter_zmw_list().

        Args:
            amp_analysis: Amplicon Analysis input, resolved to a single file
                by get_input_file().
            data_file: raw data used to build the Bax.H5 FOFN and the reader
                returned by get_bash5_reader().
            barcode_file: barcode data handed to get_barcode_reader().
            barcode_string: optional user barcode selection, forwarded to
                get_barcodes().
            min_snr: forwarded to filter_zmw_list().
            min_length: forwarded to self.resequencer().
        """
        log.info(
            "Beginning Amplicon Analysis resequencing workflow for {0}".format(
                amp_analysis))

        # Pick or create a single file from AA and read it
        amp_analysis_file = get_input_file(amp_analysis)
        amp_analysis_records = list(AmpliconAnalysisReader(amp_analysis_file))

        # Convert the raw data file into a BaxH5 fofn for use downstream
        # and create appropriate reader for local access
        bash5 = get_bash5_reader(data_file)
        baxh5_file = os.path.join(self.output, 'baxh5.fofn')
        create_baxh5_fofn(data_file, baxh5_file)

        # Create a Reader for the Barcode data and find the overlap with any
        # barcodes specified by the user
        bc_reader = get_barcode_reader(barcode_file)
        bc_list = get_barcodes(bc_reader, barcode_string)

        for i, bc in enumerate(bc_list):
            log.info('Resequencing Barcode {0} (#{1} of {2})'.format(
                bc, i + 1, len(bc_list)))
            output_dir = self.get_output_folder(bc)

            # Extract any consensus sequences associated with this barcode
            record_list = [r for r in amp_analysis_records if r.barcode == bc]
            log.info(
                'Identified {0} consensus sequences for Barcode {1}'.format(
                    len(record_list), bc))
            filtered_records = [r for r in record_list if r.num_reads >= 20]
            unique_records = get_unique_records(filtered_records)

            # A barcode may have no consensus sequences at all; guard the
            # division so the workflow does not die with ZeroDivisionError.
            # Scale to a percentage *before* rounding so the log shows e.g.
            # 33.3 rather than 33.300000000000004.
            total_records = len(record_list)
            if total_records:
                fraction = round(
                    100.0 * len(unique_records) / total_records, 1)
            else:
                fraction = 0.0
            log.info(
                '{0} of {1} ({2}%) consensus sequences passed all filters'.
                format(len(unique_records), total_records, fraction))
            reference_file = os.path.join(output_dir, 'reference.fasta')
            write_records(unique_records, reference_file)

            # Identify all high-quality, barcode-specific ZMWs and write them to file
            zmw_list = get_barcode_zmws(bc_reader, bc)
            zmw_list = filter_zmw_list(bash5, zmw_list, min_snr=min_snr)
            whitelist_file = os.path.join(output_dir, 'whitelist.txt')
            write_zmw_whitelist(bash5, zmw_list, whitelist_file)

            # Resequence the selected consensus sequences with the selected ZMWs
            self.resequencer(baxh5_file,
                             whitelist_file,
                             reference_file,
                             output=output_dir,
                             min_length=min_length)

            log.info("Finished resequencing Barcode {0}\n".format(bc))
Example #3
0
def _create_baxh5_fofn(input_file, output):
    """
    Return the path of the Bax.H5 FOFN inside `output`, writing it if needed.

    Converting any BasH5 input files to BaxH5 avoids file-type problems
    downstream. An existing 'baxh5.fofn' that passes valid_file() is reused
    untouched; otherwise the file is generated from `input_file` with
    create_baxh5_fofn() and then verified with check_output_file().
    """
    log.info('Creating FOFN of Bax.H5 files')
    fofn_path = os.path.join(output, 'baxh5.fofn')

    if not valid_file(fofn_path):
        # Nothing usable on disk yet: build the FOFN and verify the result
        log.info("No existing Bax.H5 fofn found")
        create_baxh5_fofn(input_file, fofn_path)
        check_output_file(fofn_path)
        log.info('Finished writing Bax.H5 fofn file\n')
    else:
        log.info("Using existing Bax.H5 FOFN file")

    return fofn_path
Example #4
0
def _create_baxh5_fofn(input_file, output):
    """
    Return the path of the Bax.H5 FOFN inside `output`, writing it if needed.

    Converting any BasH5 input files to BaxH5 avoids file-type problems
    downstream. An existing 'baxh5.fofn' that passes valid_file() is reused
    untouched; otherwise the file is generated from `input_file` with
    create_baxh5_fofn() and then verified with check_output_file().
    """
    log.info('Creating FOFN of Bax.H5 files')
    fofn_path = os.path.join(output, 'baxh5.fofn')

    if not valid_file(fofn_path):
        # Nothing usable on disk yet: build the FOFN and verify the result
        log.info("No existing Bax.H5 fofn found")
        create_baxh5_fofn(input_file, fofn_path)
        check_output_file(fofn_path)
        log.info('Finished writing Bax.H5 fofn file\n')
    else:
        log.info("Using existing Bax.H5 FOFN file")

    return fofn_path
    def __call__(self, amp_analysis, data_file, barcode_file, barcode_string=None, min_snr=None, min_length=None):
        """
        Run the barcode-by-barcode Amplicon Analysis resequencing workflow.

        For each barcode returned by get_barcodes(), the consensus sequences
        carrying that barcode are filtered (num_reads >= 20, then
        get_unique_records), written to 'reference.fasta' in the barcode's
        output folder, and resequenced against the barcode's ZMWs that pass
        filter_zmw_list().

        Args:
            amp_analysis: Amplicon Analysis input, resolved to a single file
                by get_input_file().
            data_file: raw data used to build the Bax.H5 FOFN and the reader
                returned by get_bash5_reader().
            barcode_file: barcode data handed to get_barcode_reader().
            barcode_string: optional user barcode selection, forwarded to
                get_barcodes().
            min_snr: forwarded to filter_zmw_list().
            min_length: forwarded to self.resequencer().
        """
        log.info("Beginning Amplicon Analysis resequencing workflow for {0}".format(amp_analysis))

        # Pick or create a single file from AA and read it
        amp_analysis_file = get_input_file(amp_analysis)
        amp_analysis_records = list(AmpliconAnalysisReader(amp_analysis_file))

        # Convert the raw data file into a BaxH5 fofn for use downstream
        # and create appropriate reader for local access
        bash5 = get_bash5_reader(data_file)
        baxh5_file = os.path.join(self.output, 'baxh5.fofn')
        create_baxh5_fofn(data_file, baxh5_file)

        # Create a Reader for the Barcode data and find the overlap with any
        # barcodes specified by the user
        bc_reader = get_barcode_reader(barcode_file)
        bc_list = get_barcodes(bc_reader, barcode_string)

        for i, bc in enumerate(bc_list):
            log.info('Resequencing Barcode {0} (#{1} of {2})'.format(bc, i+1, len(bc_list)))
            output_dir = self.get_output_folder(bc)

            # Extract any consensus sequences associated with this barcode
            record_list = [r for r in amp_analysis_records if r.barcode == bc]
            log.info('Identified {0} consensus sequences for Barcode {1}'.format(len(record_list), bc))
            filtered_records = [r for r in record_list if r.num_reads >= 20]
            unique_records = get_unique_records(filtered_records)

            # A barcode may have no consensus sequences at all; guard the
            # division so the workflow does not die with ZeroDivisionError.
            # Scale to a percentage *before* rounding so the log shows e.g.
            # 33.3 rather than 33.300000000000004.
            total_records = len(record_list)
            if total_records:
                fraction = round(100.0 * len(unique_records) / total_records, 1)
            else:
                fraction = 0.0
            log.info('{0} of {1} ({2}%) consensus sequences passed all filters'.format(len(unique_records),
                                                                                       total_records,
                                                                                       fraction))
            reference_file = os.path.join(output_dir, 'reference.fasta')
            write_records(unique_records, reference_file)

            # Identify all high-quality, barcode-specific ZMWs and write them to file
            zmw_list = get_barcode_zmws(bc_reader, bc)
            zmw_list = filter_zmw_list(bash5, zmw_list, min_snr=min_snr)
            whitelist_file = os.path.join(output_dir, 'whitelist.txt')
            write_zmw_whitelist(bash5, zmw_list, whitelist_file)

            # Resequence the selected consensus sequences with the selected ZMWs
            self.resequencer(baxh5_file,
                             whitelist_file,
                             reference_file,
                             output=output_dir,
                             min_length=min_length)

            log.info("Finished resequencing Barcode {0}\n".format(bc))
    def do_resequencing(self, data_file, amp_analysis_records, min_length, whitelist_file):
        """
        Resequence the supplied Amplicon Analysis records in a single pass.

        The records are written to 'reference.fasta' inside the
        'amp_analysis_resequencing' output folder, a Bax.H5 FOFN is created
        under self.output, and both are handed to self.resequencer together
        with the caller-provided whitelist file and min_length.
        """
        log.info("Resequencing supplied Amplicon Analysis sequences")
        work_dir = self.get_output_folder("amp_analysis_resequencing")

        # Open a reader on the raw data (its result is not used further in
        # this method) and write the Bax.H5 FOFN consumed by the resequencer
        _bash5_reader = get_bash5_reader(data_file)
        fofn_path = os.path.join(self.output, "baxh5.fofn")
        create_baxh5_fofn(data_file, fofn_path)

        # Every supplied record becomes part of the reference
        record_count = len(amp_analysis_records)
        log.info("Identified {0} consensus sequences to resequence".format(record_count))
        reference_path = os.path.join(work_dir, "reference.fasta")
        write_records(amp_analysis_records, reference_path)

        # Run the resequencer on the reference with the supplied whitelist
        self.resequencer(
            fofn_path,
            whitelist_file,
            reference_path,
            output=work_dir,
            min_length=min_length,
        )
        log.info("Finished resequencing supplied Amplicon Analysis sequences\n")
    def do_barcoded_resequencing(
        self, data_file, amp_analysis_records, barcode_file, barcode_string, min_snr, min_length
    ):
        """
        Resequence already-loaded Amplicon Analysis records barcode by barcode.

        For each barcode returned by get_barcodes(), the records carrying
        that barcode are filtered (num_reads >= 20, then get_unique_records),
        written to 'reference.fasta' in the barcode's output folder, and
        resequenced against the barcode's ZMWs that pass filter_zmw_list().

        Args:
            data_file: raw data used to build the Bax.H5 FOFN and the reader
                returned by get_bash5_reader().
            amp_analysis_records: consensus records; each is read for its
                `barcode` and `num_reads` attributes.
            barcode_file: barcode data handed to get_barcode_reader().
            barcode_string: user barcode selection, forwarded to
                get_barcodes().
            min_snr: forwarded to filter_zmw_list().
            min_length: forwarded to self.resequencer().
        """
        # Convert the raw data file into a BaxH5 fofn for use downstream
        # and create appropriate reader for local access
        bash5 = get_bash5_reader(data_file)
        baxh5_file = os.path.join(self.output, "baxh5.fofn")
        create_baxh5_fofn(data_file, baxh5_file)

        # Create a Reader for the Barcode data and find the overlap with any
        # barcodes specified by the user
        bc_reader = get_barcode_reader(barcode_file)
        bc_list = get_barcodes(bc_reader, barcode_string)

        for i, bc in enumerate(bc_list):
            log.info("Resequencing Barcode {0} (#{1} of {2})".format(bc, i + 1, len(bc_list)))
            output_dir = self.get_output_folder(bc)

            # Extract any consensus sequences associated with this barcode
            record_list = [r for r in amp_analysis_records if r.barcode == bc]
            log.info("Identified {0} consensus sequences for Barcode {1}".format(len(record_list), bc))
            filtered_records = [r for r in record_list if r.num_reads >= 20]
            unique_records = get_unique_records(filtered_records)

            # A barcode may have no consensus sequences at all; guard the
            # division so the workflow does not die with ZeroDivisionError.
            # Scale to a percentage *before* rounding so the log shows e.g.
            # 33.3 rather than 33.300000000000004.
            total_records = len(record_list)
            if total_records:
                fraction = round(100.0 * len(unique_records) / total_records, 1)
            else:
                fraction = 0.0
            log.info(
                "{0} of {1} ({2}%) consensus sequences passed all filters".format(
                    len(unique_records), total_records, fraction
                )
            )
            reference_file = os.path.join(output_dir, "reference.fasta")
            write_records(unique_records, reference_file)

            # Identify all high-quality, barcode-specific ZMWs and write them to file
            zmw_list = get_barcode_zmws(bc_reader, bc)
            zmw_list = filter_zmw_list(bash5, zmw_list, min_snr=min_snr)
            whitelist_file = os.path.join(output_dir, "whitelist.txt")
            write_zmw_whitelist(bash5, zmw_list, whitelist_file)

            # Resequence the selected consensus sequences with the selected ZMWs
            self.resequencer(baxh5_file, whitelist_file, reference_file, output=output_dir, min_length=min_length)

            log.info("Finished resequencing Barcode {0}\n".format(bc))