def test_get_LEA_seq_consensus_seqs(self):
        """Check the consensus strings produced by get_LEA_seq_consensus_seqs.

        The function is run twice with identical settings: once on the
        ``fwd_read_fh`` / ``rev_read_fh`` / ``mapping_fp`` fixtures and once
        on the ``get_cons_*`` fixtures.  Each returned lookup is indexed by
        sample id and then by random barcode, and the stored
        ``forward^reverse`` consensus string is compared with the expected
        value.
        """
        # Settings shared by both calls, listed in the positional order in
        # which they are handed to get_LEA_seq_consensus_seqs (after the
        # three input fixtures and the temporary directory).
        shared_settings = (
            int(7),                           # barcode_type
            7,                                # barcode_len
            None,                             # barcode_correction_fn
            1.5,                              # max_barcode_errors
            0.66,                             # min_consensus
            2.5,                              # max_cluster_ratio
            0.86,                             # min_difference_in_bcs
            19,                               # fwd_length
            19,                               # rev_length
            1,                                # min_reads_per_random_bc
            self.min_difference_in_clusters,  # min_diff_in_clusters
            'BarcodeSequence',                # barcode_column
            'ReversePrimer',                  # reverse_primer_column
        )

        # First call: exercises the second condition of the if statement in
        # get_consensus_seq_lookup, i.e. the majority sequence is selected.
        # The cluster ratio max_cluster/second_best_cluster in fwd_read_data
        # (and rev_read_data) is 3/1 > 2.5, so get_consensus is not called.
        majority_lookup, _ = get_LEA_seq_consensus_seqs(
            self.fwd_read_fh, self.rev_read_fh, self.mapping_fp,
            self.temp_dir, *shared_settings)
        self.assertEqual(
            majority_lookup['Sample1']['AGCTACGAGCTATTGC'],
            'AAAAAAAAAAAAAAAAAAA^AAAAAAAAAAAAAAAAAA')

        # Second call: exercises the third condition of the if statement in
        # get_consensus_seq_lookup, i.e. get_consensus is called.  The
        # cluster ratio max_cluster/second_best_cluster in
        # get_cons_fwd_read_data (and get_cons_rev_read_data) is 2/1 (< 2.5),
        # so the majority sequence is not selected.
        consensus_lookup, _ = get_LEA_seq_consensus_seqs(
            self.get_cons_fwd_read_fh, self.get_cons_rev_read_fh,
            self.get_cons_mapping_fp, self.temp_dir, *shared_settings)

        expected_consensus = (
            ('Sample1', 'AGCTACGAGCTATTGC',
             'AAAAAAAAAACAAAAAAAA^AAAAAAAAAATAAAAATA'),
            ('Sample2', 'AGCTACGCATCAAGGG',
             'AAAAAAAAAATAAAAAAAA^TTAAAAAAAAAAAAGAAAA'),
        )
        for sample_id, random_bc, expected_seq in expected_consensus:
            self.assertEqual(consensus_lookup[sample_id][random_bc],
                             expected_seq)

        # More than one sample must be present in the second lookup.
        self.assertFalse(len(consensus_lookup) <= 1,
                         msg="The get_consensus_seqs_lookup function "
                         "has returned early, without completing "
                         "the three 'for' loops.")
Ejemplo n.º 2
0
    def test_get_LEA_seq_consensus_seqs(self):
        """Check one consensus string from get_LEA_seq_consensus_seqs.

        Runs the function on the ``fwd_read_data`` / ``rev_read_data`` /
        ``mapping_fp`` fixtures and compares the ``forward^reverse``
        consensus string stored for sample ``Sample1`` and random barcode
        ``AGCTACGAGCTATTGC`` with the expected value.
        """
        # Positional settings passed after the input fixtures and temp dir.
        settings = (
            int(7),                           # barcode_type
            7,                                # barcode_len
            None,                             # barcode_correction_fn
            1.5,                              # max_barcode_errors
            0.66,                             # min_consensus
            2.5,                              # max_cluster_ratio
            0.86,                             # min_difference_in_bcs
            19,                               # fwd_length
            19,                               # rev_length
            1,                                # min_reads_per_random_bc
            self.min_difference_in_clusters,  # min_diff_in_clusters
            'BarcodeSequence',                # barcode_column
            'ReversePrimer',                  # reverse_primer_column
        )

        # The second return value is not inspected by this test.
        lookup, _ = get_LEA_seq_consensus_seqs(
            self.fwd_read_data, self.rev_read_data, self.mapping_fp,
            self.temp_dir, *settings)

        self.assertEqual(lookup['Sample1']['AGCTACGAGCTATTGC'],
                         'AAAAAAAAAAAAAAAAAAA^AAAAAAAAAAAAAAAAAA')
    def test_get_LEA_seq_consensus_seqs(self):
        """Check the consensus strings produced by get_LEA_seq_consensus_seqs.

        Two fixture sets are pushed through the function with the same
        settings.  Each returned lookup is indexed by sample id and then by
        random barcode; the ``forward^reverse`` consensus string found there
        is compared with the expected value.
        """
        temp_dir = self.temp_dir
        min_diff_in_clusters = self.min_difference_in_clusters

        def build_lookup(fwd_reads, rev_reads, mapping):
            """Run get_LEA_seq_consensus_seqs and return only the lookup."""
            lookup, _ = get_LEA_seq_consensus_seqs(
                fwd_reads, rev_reads, mapping, temp_dir,
                int(7),                # barcode_type
                7,                     # barcode_len
                None,                  # barcode_correction_fn
                1.5,                   # max_barcode_errors
                0.66,                  # min_consensus
                2.5,                   # max_cluster_ratio
                0.86,                  # min_difference_in_bcs
                19,                    # fwd_length
                19,                    # rev_length
                1,                     # min_reads_per_random_bc
                min_diff_in_clusters,
                'BarcodeSequence',     # barcode_column
                'ReversePrimer')       # reverse_primer_column
            return lookup

        # Second condition of the if statement in get_consensus_seq_lookup:
        # the majority sequence is selected, because the cluster ratio
        # max_cluster/second_best_cluster in fwd_read_data (and
        # rev_read_data) is 3/1 > 2.5, so get_consensus is not called.
        majority = build_lookup(self.fwd_read_fh, self.rev_read_fh,
                                self.mapping_fp)
        self.assertEqual(majority['Sample1']['AGCTACGAGCTATTGC'],
                         'AAAAAAAAAAAAAAAAAAA^AAAAAAAAAAAAAAAAAA')

        # Third condition of the if statement in get_consensus_seq_lookup:
        # get_consensus is called, because the cluster ratio
        # max_cluster/second_best_cluster in get_cons_fwd_read_data (and
        # get_cons_rev_read_data) is 2/1 (< 2.5), so the majority sequence
        # is not selected.
        computed = build_lookup(self.get_cons_fwd_read_fh,
                                self.get_cons_rev_read_fh,
                                self.get_cons_mapping_fp)
        self.assertEqual(computed['Sample1']['AGCTACGAGCTATTGC'],
                         'AAAAAAAAAACAAAAAAAA^AAAAAAAAAATAAAAATA')
        self.assertEqual(computed['Sample2']['AGCTACGCATCAAGGG'],
                         'AAAAAAAAAATAAAAAAAA^TTAAAAAAAAAAAAGAAAA')

        # More than one sample must be present in the second lookup.
        self.assertFalse(len(computed) <= 1,
                         msg="The get_consensus_seqs_lookup function "
                         "has returned early, without completing "
                         "the three 'for' loops.")
Ejemplo n.º 4
0
def main():
    """Build LEA-Seq consensus sequences and write them to ``output_dir``.

    Parses the command line options described by ``script_info``, validates
    them, runs ``get_LEA_seq_consensus_seqs`` on the forward reads, the
    reverse reads and the mapping file, and writes three files into the
    output directory:

    * ``fwd.fna`` -- one ``>sample_index`` record per forward consensus
    * ``rev.fna`` -- one ``>sample_index`` record per reverse consensus
    * ``log.txt`` -- the log text returned by ``get_LEA_seq_consensus_seqs``

    Invalid options are reported through ``option_parser.error``.  All
    validation is done before anything is created on disk, so a bad
    invocation no longer truncates results from an earlier run, and every
    file is managed by a ``with`` block so handles are released even when
    an exception is raised.
    """
    option_parser, opts, _ = parse_command_line_parameters(**script_info)
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors
    mapping_fp = opts.mapping_fp
    sequence_read_fps = opts.sequence_read_fps
    min_consensus = opts.min_consensus
    max_cluster_ratio = opts.max_cluster_ratio
    output_dir = opts.output_dir
    min_difference_in_bcs = opts.min_difference_in_bcs
    fwd_length = opts.fwd_length
    rev_length = opts.rev_length
    min_reads_per_random_bc = opts.min_reads_per_random_bc
    min_diff_in_clusters = opts.min_difference_in_clusters
    barcode_column = opts.header_barcode_column
    reverse_primer_column = opts.reverse_primer_column

    # --- Option validation (no filesystem side effects yet) ---------------
    if barcode_type == 'golay_12':
        barcode_correction_fn = decode_golay_12
        barcode_len = 12
    else:
        # Any other barcode type must be an integer barcode length; no
        # error correction is applied in that case.
        barcode_correction_fn = None

        try:
            barcode_len = int(barcode_type)
        except ValueError:
            option_parser.error("Invalid barcode type '%s'. The barcode type "
                                "must be either golay_12 or a positive "
                                "integer indicating the barcode length." %
                                barcode_type)

    if max_barcode_errors < 0:
        option_parser.error("--max_barcode_errors must be greater than or "
                            "equal to zero. You provided %.4f." %
                            max_barcode_errors)

    if barcode_len < 1:
        option_parser.error("Invalid barcode length: %d. Must be greater "
                            "than zero." % barcode_len)

    if len(sequence_read_fps) != 2:
        option_parser.error("You must provide exactly two sequence read "
                            "filepaths, the first for forward reads and "
                            "second for reverse reads. You specified %d "
                            "filepaths." % len(sequence_read_fps))

    create_dir(output_dir)

    # 'U' requests universal-newline reading.
    # NOTE(review): the 'U' mode was removed in Python 3.11; switch to 'r'
    # if this script is ever ported to Python 3.
    with open(sequence_read_fps[0], 'U') as fwd_read_f, \
            open(sequence_read_fps[1], 'U') as rev_read_f, \
            open(mapping_fp, 'U') as map_f:
        consensus_seq_lookup, log_out = get_LEA_seq_consensus_seqs(
            fwd_read_f, rev_read_f, map_f, output_dir, barcode_type,
            barcode_len, barcode_correction_fn, max_barcode_errors,
            min_consensus, max_cluster_ratio, min_difference_in_bcs,
            fwd_length, rev_length, min_reads_per_random_bc,
            min_diff_in_clusters, barcode_column, reverse_primer_column)

        # Outputs are opened only once the consensus step has succeeded, so
        # a failure above leaves previously written results intact.
        with open(path.join(output_dir, "fwd.fna"), "w") as fwd_out, \
                open(path.join(output_dir, "rev.fna"), "w") as rev_out, \
                open(path.join(output_dir, "log.txt"), "w") as log_file:
            for sample_id in consensus_seq_lookup:
                sample_consensus = consensus_seq_lookup[sample_id]
                for bc_index, rand_bc in enumerate(sample_consensus):
                    # Each value is "<forward>^<reverse>".
                    fwd_consensus, rev_consensus = \
                        sample_consensus[rand_bc].split('^')
                    fwd_out.write(">{}_{}\n{}\n".format(
                        sample_id, bc_index, fwd_consensus))
                    rev_out.write(">{}_{}\n{}\n".format(
                        sample_id, bc_index, rev_consensus))

            log_file.write(log_out)
Ejemplo n.º 5
0
def main():
    """Build LEA-Seq consensus sequences and write them to ``output_dir``.

    Parses the command line options described by ``script_info``, validates
    them, runs ``get_LEA_seq_consensus_seqs`` on the forward reads, the
    reverse reads and the mapping file, and writes three files into the
    output directory:

    * ``fwd.fna`` -- one ``>sample_index`` record per forward consensus
    * ``rev.fna`` -- one ``>sample_index`` record per reverse consensus
    * ``log.txt`` -- the log text returned by ``get_LEA_seq_consensus_seqs``

    Invalid options (negative ``--max_barcode_errors``, fractions outside
    0..1, a barcode length below one, or anything other than two read
    files) are reported through ``option_parser.error``.  All validation is
    done before anything is created on disk, so a bad invocation no longer
    truncates results from an earlier run, and every file is managed by a
    ``with`` block so handles are released even when an exception is raised.
    """
    option_parser, opts, _ = parse_command_line_parameters(**script_info)
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors
    mapping_fp = opts.mapping_fp
    sequence_read_fps = opts.sequence_read_fps
    min_consensus = opts.min_consensus
    max_cluster_ratio = opts.max_cluster_ratio
    output_dir = opts.output_dir
    min_difference_in_bcs = opts.min_difference_in_bcs
    fwd_length = opts.fwd_length
    rev_length = opts.rev_length
    min_reads_per_random_bc = opts.min_reads_per_random_bc
    min_diff_in_clusters = opts.min_difference_in_clusters
    barcode_column = opts.header_barcode_column
    reverse_primer_column = opts.reverse_primer_column

    # --- Option validation (no filesystem side effects yet) ---------------
    if barcode_type == 'golay_12':
        barcode_correction_fn = decode_golay_12
        barcode_len = 12
    else:
        # Any other barcode type must be an integer barcode length; no
        # error correction is applied in that case.
        barcode_correction_fn = None

        try:
            barcode_len = int(barcode_type)
        except ValueError:
            option_parser.error("Invalid barcode type '%s'. The barcode type "
                                "must be either golay_12 or a positive "
                                "integer indicating the barcode length." %
                                barcode_type)

    if max_barcode_errors < 0:
        option_parser.error("--max_barcode_errors must be greater than or "
                            "equal to zero. You provided %.4f." %
                            max_barcode_errors)

    if min_diff_in_clusters < 0 or min_diff_in_clusters > 1:
        option_parser.error("--min_difference_in_clusters must be "
                            "between 0 to 1. You provided %.4f." %
                            min_diff_in_clusters)

    if min_difference_in_bcs < 0 or min_difference_in_bcs > 1:
        option_parser.error("--min_difference_in_bcs must be between 0 to 1."
                            " You provided %.4f." % min_difference_in_bcs)

    if barcode_len < 1:
        option_parser.error("Invalid barcode length: %d. Must be greater "
                            "than zero." % barcode_len)

    if len(sequence_read_fps) != 2:
        option_parser.error("You must provide exactly two sequence read "
                            "filepaths, the first for forward reads and "
                            "second for reverse reads. You specified %d "
                            "filepaths." % len(sequence_read_fps))

    create_dir(output_dir)

    # 'U' requests universal-newline reading.
    # NOTE(review): the 'U' mode was removed in Python 3.11; switch to 'r'
    # if this script is ever ported to Python 3.
    with open(sequence_read_fps[0], 'U') as fwd_read_f, \
            open(sequence_read_fps[1], 'U') as rev_read_f, \
            open(mapping_fp, 'U') as map_f:
        consensus_seq_lookup, log_out = get_LEA_seq_consensus_seqs(
            fwd_read_f, rev_read_f, map_f, output_dir, barcode_type,
            barcode_len, barcode_correction_fn, max_barcode_errors,
            min_consensus, max_cluster_ratio, min_difference_in_bcs,
            fwd_length, rev_length, min_reads_per_random_bc,
            min_diff_in_clusters, barcode_column, reverse_primer_column)

        # Outputs are opened only once the consensus step has succeeded, so
        # a failure above leaves previously written results intact.
        with open(path.join(output_dir, "fwd.fna"), "w") as fwd_out, \
                open(path.join(output_dir, "rev.fna"), "w") as rev_out, \
                open(path.join(output_dir, "log.txt"), "w") as log_file:
            for sample_id in consensus_seq_lookup:
                sample_consensus = consensus_seq_lookup[sample_id]
                for bc_index, rand_bc in enumerate(sample_consensus):
                    # Each value is "<forward>^<reverse>".
                    fwd_consensus, rev_consensus = \
                        sample_consensus[rand_bc].split('^')
                    fwd_out.write(">{}_{}\n{}\n".format(
                        sample_id, bc_index, fwd_consensus))
                    rev_out.write(">{}_{}\n{}\n".format(
                        sample_id, bc_index, rev_consensus))

            log_file.write(log_out)