Exemple #1
0
def test_reverse_complemented_sequence():
    """Check reverse-complementing of records with and without qualities."""
    # With qualities: bases are reverse-complemented, qualities reversed.
    with_qualities = dnaio.Sequence("the_name", "ACGTTTGA", "B>%%BB5#")
    expected = dnaio.Sequence("the_name", "TCAAACGT", "#5BB%%>B")
    assert reverse_complemented_sequence(with_qualities) == expected

    # Without qualities: only the bases are affected.
    without_qualities = dnaio.Sequence("the_name", "ACGTTTGA")
    expected = dnaio.Sequence("the_name", "TCAAACGT")
    assert reverse_complemented_sequence(without_qualities) == expected
Exemple #2
0
def test_append(tmpdir, fileformat, extension):
    """Check that a record written in append mode follows the existing one."""
    records = [
        dnaio.Sequence("s1", "ACGT", "HHHH"),
        dnaio.Sequence("s2", "CGCA", "8383"),
    ]
    path = str(tmpdir / ("out." + fileformat + extension))

    # The first record is written with mode "w", the second with mode "a".
    for mode, record in zip(("w", "a"), records):
        with dnaio.open(path, mode=mode) as writer:
            writer.write(record)

    with xopen(path) as reader:
        assert formatted_sequences(records, fileformat) == reader.read()
Exemple #3
0
def reverse_complemented_sequence(sequence: dnaio.Sequence):
    """Return a new record holding the reverse complement of *sequence*.

    The name is kept, the bases are passed through ``reverse_complement`` and
    the quality string, when present, is reversed. A record without qualities
    yields a record without qualities.
    """
    qualities = sequence.qualities
    if qualities is not None:
        # Qualities have to follow the new base order.
        qualities = qualities[::-1]
    bases = reverse_complement(sequence.sequence)
    return dnaio.Sequence(sequence.name, bases, qualities)
Exemple #4
0
def test_write(tmpdir, extension):
    """Check that a single record is written as a four-line FASTQ entry."""
    record = dnaio.Sequence('name', 'ACGT', 'HHHH')
    target = tmpdir.join("out.fastq" + extension)
    with dnaio.open(str(target), mode='w') as writer:
        writer.write(record)
    with xopen(target) as reader:
        written = reader.read()
    assert written == '@name\nACGT\n+\nHHHH\n'
Exemple #5
0
def main(args):
    """Cluster the UMIs of every DBS cluster and write corrected reads.

    ``args.abcfile`` is passed through ``get_umis`` (with
    ``length=args.length``); the result is linked to the records of
    ``args.dbsfile`` by ``assign_to_dbs``. The UMIs of each DBS are then
    clustered with ``UMIClusterer`` (directional method, threshold
    ``args.threshold``). For every read, a FASTA record named after the read
    and carrying the first sequence of its UMI cluster is written to
    ``args.output``. Timings and the module-level ``stats`` counters are
    logged along the way.
    """
    logger.info(f"Filtering reads not of length {args.length} bp.")

    time_start = time.time()

    # Read ABC fasta with UMI sequences and save read name and sequence.
    with dnaio.open(args.abcfile, mode="r") as abc_reads:
        umis = get_umis(abc_reads, length=args.length)

    time_filtered = time.time()
    logger.info(f"Time for filtering: {time_filtered - time_start} s")
    logger.info("Assigning UMIs to DBS clusters")

    with dnaio.open(args.dbsfile, mode="r") as dbs_reads:
        dbs_umis = assign_to_dbs(dbs_reads, umis)

    logger.info(f"DBS clusters linked to ABC: {len(dbs_umis)}")

    time_assign = time.time()
    logger.info(f"Time for assigning clusters: {time_assign - time_filtered} s")
    logger.info("Starting clustering of UMIs within clusters.")

    # Set clustering method
    # Based on https://umi-tools.readthedocs.io/en/latest/API.html
    clusterer = UMIClusterer(cluster_method='directional')

    with dnaio.open(args.output, fileformat="fasta", mode="w") as output:
        # Only the per-DBS mapping (UMI -> read names) is needed here.
        for umi_reads in dbs_umis.values():
            # UMI-tools is fed byte strings together with their read counts.
            counts = {
                bytes(umi, encoding='utf-8'): len(reads)
                for umi, reads in umi_reads.items()
            }
            stats["Total UMIs"] += len(counts)

            clusters = clusterer(counts, threshold=args.threshold)
            stats["Total clustered UMIs"] += len(clusters)

            # Every read of a cluster gets the cluster's first sequence.
            for cluster in clusters:
                members = [raw.decode("utf-8") for raw in cluster]
                canonical_sequence = members[0]
                for member in members:
                    for read_name in umi_reads[member]:
                        output.write(
                            dnaio.Sequence(read_name, canonical_sequence))

    time_end = time.time()
    logger.info(f"Time for clustering: {time_end - time_assign} s")
    logger.info(f"Total time to run: {time_end - time_start} s")

    # Send stats to log
    logger.info(f"Reads filtered out: {stats['Reads filtered out']:,}")
    logger.info(f"Reads kept: {stats['Reads kept']}")
    logger.info(f"Total UMIs: {stats['Total UMIs']}")
    logger.info(f"Total clustered UMIs: {stats['Total clustered UMIs']}")
Exemple #6
0
def test_write_interleaved(tmpdir, fileformat, extension):
    """Check that paired records end up alternating in one interleaved file."""
    pairs = [
        (dnaio.Sequence("s1", "ACGT", "HHHH"),
         dnaio.Sequence("t1", "TCGT", "5HHH")),
        (dnaio.Sequence("s2", "CGCA", "8383"),
         dnaio.Sequence("t2", "TGCA", "5383")),
    ]
    path = str(tmpdir / ("out.interleaved." + fileformat + extension))

    with dnaio.open(path, interleaved=True, fileformat=fileformat,
                    mode="w") as writer:
        for first, second in pairs:
            writer.write(first, second)

    # Flattening the pairs gives the expected on-disk record order.
    expected = [record for pair in pairs for record in pair]
    with xopen(path) as reader:
        assert formatted_sequences(expected, fileformat) == reader.read()
Exemple #7
0
def test_write_paired(tmpdir, fileformat, extension):
    """Check that paired records are split over the two output files."""
    first_reads = [
        dnaio.Sequence("s1", "ACGT", "HHHH"),
        dnaio.Sequence("s2", "CGCA", "8383"),
    ]
    second_reads = [
        dnaio.Sequence("t1", "TCGT", "5HHH"),
        dnaio.Sequence("t2", "TGCA", "5383"),
    ]
    suffix = fileformat + extension
    path1 = str(tmpdir / ("out.1." + suffix))
    path2 = str(tmpdir / ("out.2." + suffix))

    with dnaio.open(path1, file2=path2, fileformat=fileformat,
                    mode="w") as writer:
        for first, second in zip(first_reads, second_reads):
            writer.write(first, second)

    # Each file must contain exactly its own mate, in write order.
    for path, records in ((path1, first_reads), (path2, second_reads)):
        with xopen(path) as reader:
            assert formatted_sequences(records, fileformat) == reader.read()
Exemple #8
0
def test_write_pathlib(tmpdir, fileformat, extension):
    """Check that ``dnaio.open`` accepts a ``pathlib.Path`` as output path."""
    record = dnaio.Sequence("s1", "ACGT", "HHHH")
    path = Path(str(tmpdir / ("out." + fileformat + extension)))
    with dnaio.open(path, mode="w") as writer:
        writer.write(record)

    expected = (
        b">s1\nACGT\n" if fileformat == "fasta" else b"@s1\nACGT\n+\nHHHH\n"
    )
    with xopen(path, "rb") as reader:
        assert reader.read() == expected
Exemple #9
0
def test_write_with_xopen(tmpdir, fileformat, extension):
    """Check writing through a binary file object opened with ``xopen``."""
    record = dnaio.Sequence('name', 'ACGT', 'HHHH')
    out_path = str(tmpdir.join("out." + fileformat + extension))
    with xopen(out_path, 'wb') as outer_f:
        # The format is given explicitly when writing to an open file object.
        with dnaio.open(outer_f, mode='w', fileformat=fileformat) as writer:
            writer.write(record)

    # Anything that is not "fasta" is expected to be written as FASTQ.
    expected = {"fasta": ">name\nACGT\n"}.get(
        fileformat, "@name\nACGT\n+\nHHHH\n")
    with xopen(out_path) as reader:
        assert reader.read() == expected
Exemple #10
0
def generate_modified_fastq(read1_file,
                            read2_file,
                            cb_file,
                            read1_coords,
                            modified_read_file,
                            num_mismatches=1,
                            num_n_threshold=3):
    """Match cell barcodes and write a modified FASTQ file of read 2.

    Every line of ``cb_file`` contributes one cell barcode (the part before
    the first ``-``). Each read pair from ``read1_file`` / ``read2_file`` is
    handed to ``match_cell_barcodes``; when that returns a truthy result,
    read 2 is written to ``modified_read_file`` under a name made of the
    first space-delimited token of the returned read name followed by
    ``RI:Z:<read1_seq>#<barcode>#<distance>``.

    Returns:
        A tuple ``(modified_read_file, [matched_pairs, total_pairs])``.
    """
    cell_barcodes = []
    for line in open_by_suffix(cb_file, mode='r'):
        cell_barcodes.append(line.rstrip().split('-')[0])

    cb_index = create_index(barcodes=cell_barcodes,
                            num_mismatches=num_mismatches)

    num_matched = 0
    num_total = 0
    with dnaio.open(file1=read1_file,
                    file2=read2_file,
                    fileformat='fastq',
                    mode='r') as f, dnaio.open(file1=modified_read_file,
                                               fileformat='fastq',
                                               mode='w') as f_out:

        for read1, read2 in f:
            num_total += 1

            out = match_cell_barcodes(
                reads=(read1.name, read1.sequence, read1.qualities,
                       read2.sequence, read2.qualities),
                barcode_index=cb_index,
                read_coords=read1_coords,
                num_mismatches=num_mismatches,
                num_n_threshold=num_n_threshold)
            if not out:
                # No barcode match: the pair is counted but not written.
                continue
            num_matched += 1

            read_name, read1_seq, _, read2_seq, read2_qual, bc, dist = out

            # Keep only the read identifier and attach the match details.
            read_id = read_name.split(' ')[0]
            read_info = '#'.join([read1_seq, bc, str(dist)])
            new_name = ' '.join([read_id, 'RI:Z:' + read_info])

            f_out.write(dnaio.Sequence(new_name, read2_seq, read2_qual))

    return modified_read_file, [num_matched, num_total]
Exemple #11
0
def test_formatted_sequence():
    """Check the FASTA and FASTQ text produced by ``formatted_sequence``."""
    record = dnaio.Sequence("s1", "ACGT", "HHHH")
    expected_by_format = {
        "fasta": ">s1\nACGT\n",
        "fastq": "@s1\nACGT\n+\nHHHH\n",
    }
    for fileformat, expected in expected_by_format.items():
        assert expected == formatted_sequence(record, fileformat)
Exemple #12
0
import pytest


@pytest.fixture(params=["", ".gz", ".bz2", ".xz"])
def extension(request):
    """Parametrized fixture yielding each file name suffix in ``params``."""
    return request.param


@pytest.fixture(params=["fasta", "fastq"])
def fileformat(request):
    """Parametrized fixture yielding ``"fasta"`` and ``"fastq"`` in turn."""
    return request.param


# Two sample records per file format, keyed by format name. The "fastq"
# records carry a quality string; the "fasta" records are built without one.
SIMPLE_RECORDS = {
    "fasta": [
        dnaio.Sequence("first_sequence", "SEQUENCE1"),
        dnaio.Sequence("second_sequence", "SEQUENCE2"),
    ],
    "fastq": [
        dnaio.Sequence("first_sequence", "SEQUENCE1", ":6;;8<=:<"),
        dnaio.Sequence("second_sequence", "SEQUENCE2", "83<??:(61"),
    ],
}


def formatted_sequence(record, fileformat):
    """Return *record* rendered as FASTQ or FASTA text.

    ``fileformat == "fastq"`` gives a four-line FASTQ entry built from the
    record's ``name``, ``sequence`` and ``qualities``; any other value gives
    a two-line FASTA entry built from ``name`` and ``sequence`` only.
    """
    if fileformat != "fastq":
        return f">{record.name}\n{record.sequence}\n"
    return f"@{record.name}\n{record.sequence}\n+\n{record.qualities}\n"
Exemple #13
0
def main(args):
    """Write the unique barcodes found in FASTQ read headers to FASTA.

    The barcode of a read is the last separator-delimited field of the first
    whitespace-delimited token of its name; the separator is a space when
    ``args.space_separation`` is set and ``_`` otherwise. Barcodes seen in
    fewer than ``args.filter`` reads are not written.

    With ``args.index`` set, the counts are grouped by ``reduce_complexity``
    and one ``<index>.fa`` file per index is written into the directory
    ``args.output_fasta``. Otherwise a single file is written, with
    ``.fasta`` appended to ``args.output_fasta`` unless it already ends in
    ``.fa`` or ``.fasta``.
    """

    def write_fasta(path, counts):
        # Write one FASTA record per barcode that passes the read filter.
        logger.info(f'Writing output to {path}')
        with dnaio.open(path, fileformat="fasta", mode='w') as openout:
            # Numbering happens before filtering, so IDs of skipped
            # barcodes are simply missing from the output.
            for bc_id, (barcode, read_count) in enumerate(counts.items(),
                                                          start=1):
                if read_count < args.filter:
                    continue

                # NOTE(review): the name starts with '>' and the FASTA
                # writer presumably adds its own '>' as well - confirm that
                # the resulting '>>' header is intended.
                fasta_name = f'>{bc_id}:{read_count}:{barcode}'
                openout.write(
                    dnaio.Sequence(name=fasta_name, sequence=barcode))

    logger.info(f'Filtering barcodes with less than {args.filter} reads')

    # Count the reads of every barcode.
    separator = " " if args.space_separation else "_"
    barcode_counts = defaultdict(int)
    with dnaio.open(args.input_fastq, fileformat="fastq", mode="r") as reader:
        for read in reader:
            first_token = read.name.split()[0]
            barcode_counts[first_token.split(separator)[-1]] += 1

    if args.index:
        # Indexing mode: one output file per index of length args.index.
        indexed_barcode_count, not_atcg_index = reduce_complexity(
            barcode_counts, index_size=args.index)

        # The output directory may already be there.
        try:
            os.mkdir(args.output_fasta)
        except FileExistsError:
            pass

        for index_sequence, index_counts in indexed_barcode_count.items():
            write_fasta(f'{args.output_fasta}/{index_sequence}.fa',
                        index_counts)
    else:
        # Non-indexing mode: everything goes into a single FASTA file.
        if args.output_fasta.endswith(('.fa', '.fasta')):
            output = args.output_fasta
        else:
            output = f'{args.output_fasta}.fasta'
        write_fasta(output, barcode_counts)

    # Reporting
    logger.info(f'Unique BC count in input:\t{len(barcode_counts)}')
    if args.index:
        logger.info(
            f'BC count where N was in index (Omitted from tot. BC count):\t{not_atcg_index}'
        )
    logger.info("Finished")
Exemple #14
0
def main():
    """Simulate barcode generation, PCR and sequencing and report the result.

    Command line arguments come from ``get_arguments``. When ``args.reads``
    is given, the number of PCR cycles is derived from it. Barcodes are
    created, passed through ``pcr_cycles`` and ``add_sequencing_errors``, and
    summary statistics are printed. With ``args.output`` set, the sequenced
    barcodes are additionally written in the format named by
    ``args.output_format``: ``cd-hit`` (FASTA via dnaio) or ``starcode``
    (tab-separated text plus a ``<output>.true`` file of the true barcodes).
    """
    args = get_arguments()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(levelname)s: %(message)s")

    #
    # Header
    #

    print('*' * WIDTH)
    print('SIMULATE BARCODE GENERATION, PCR AND SEQUENCING.')
    print('*' * WIDTH)

    print('Command line options:')

    if args.reads:
        # The cycle count is derived in this one place only. An extra
        # computation ahead of the header, using
        # log(reads / number - 2 * efficiency), was overwritten here anyway
        # and raised a math domain error for reads / number <= 2 * efficiency.
        args.pcr_cycles = math.ceil(
            math.log(args.reads / args.number, 2 * args.pcr_efficency))
        print('Note: Calculating PCR cycles based on reads!')

    print('-' * WIDTH)
    arguments = [f"{a}: {v}" for a, v in vars(args).items()]
    print("\n".join(arguments))
    print('-' * WIDTH)

    #
    # Create barcodes
    #
    start = time.time()

    barcode_options = [translate(base) for base in args.sequence]

    logging.info("Creating barcodes")
    barcodes = create_barcodes_generator(args.number, barcode_options)

    #
    # Run PCR
    #
    start_pcr = time.time()
    logging.info("Running PCR")
    final_barcodes = pcr_cycles(barcodes,
                                efficiency=args.pcr_efficency,
                                pcr_cycles=args.pcr_cycles,
                                error_rate=args.error_rate_pcr,
                                nprocs=args.processes)

    logging.info(f"PCR done, time: {time.time() - start_pcr:.2f} s")
    logging.info(f"PCR errors generated: {counter['pcr errors']:,}")

    #
    # Sequencing
    #
    start_seq = time.time()

    logging.info("Sequencing")

    barcodes_after_seq = add_sequencing_errors(
        final_barcodes, seq_error_rate=args.error_rate_seq)

    logging.info(f"Sequencing done, time: {time.time() - start_seq:.2f} s")
    logging.info(
        f"Sequencing errors generated: {counter['sequencing errors']:,}")

    #
    # Output
    #
    print('-' * WIDTH)
    print('Results')
    print('-' * WIDTH)
    print(
        f"Number of barcodes before sequencing:      {counter['Barcodes start']:7,}"
    )
    print(
        f"Number of uniq barcodes before sequencing: {len(true_barcodes):7,}")
    print(
        f"Number of barcodes after PCR:              {len(final_barcodes):7,}")
    print(
        f"Number of barcode molecules after PCR:     {sum(final_barcodes.values()):7,}"
    )
    print(
        f"Number of barcodes after sequencing:       {len(barcodes_after_seq):7,}"
    )
    print(
        f"Number of barcode reads after sequencing:  {sum(barcodes_after_seq.values()):7,}"
    )
    print('-' * WIDTH)

    logging.info(f"Total run time: {time.time() - start:.2f} s")

    # Histogram of read counts per barcode: (reads, number of barcodes),
    # sorted by read count. Ratios are relative to the first row and the
    # table stops after the first row with at least 10 reads.
    distribution = sorted(
        collections.Counter(barcodes_after_seq.values()).items())
    print("Freq\tReads\tRatio")
    for reads, freq in distribution:
        print(f"{freq}\t{reads}\t{freq/distribution[0][1]:.5f}")
        if reads >= 10:
            break

    if args.debug:
        # NOTE(review): if `barcodes` is a one-shot generator it has
        # presumably been consumed by pcr_cycles already, leaving nothing
        # to print here - confirm.
        for p in sorted(collections.Counter(barcodes).items()):
            print(p)
        print()
        for p in sorted(final_barcodes.items()):
            print(p)
        print()
        for p in sorted(barcodes_after_seq.items()):
            print(p)

    if args.output:
        if args.output_format == 'cd-hit':
            with dnaio.open(args.output, mode='w',
                            fileformat='fasta') as writer:
                for nr, (barcode,
                         count) in enumerate(barcodes_after_seq.items()):
                    # The leading name field is 1 for a true barcode, else 0.
                    is_true = int(barcode in true_barcodes)
                    record = dnaio.Sequence(
                        f"{is_true}:{nr}:{count}:{barcode}", barcode)
                    writer.write(record)
        elif args.output_format == "starcode":
            with open(args.output,
                      'w') as writer, open(f"{args.output}.true",
                                           'w') as true_writer:
                for barcode, count in barcodes_after_seq.items():
                    if barcode in true_barcodes:
                        print(barcode, file=true_writer)

                    print(f"{barcode}\t{count}", file=writer)