def test_reverse_complemented_sequence():
    """Check reverse-complementing a record with and without qualities."""
    name = "the_name"
    original_bases = "ACGTTTGA"
    expected_bases = "TCAAACGT"

    # Record carrying qualities: the quality string must come back reversed.
    with_qualities = dnaio.Sequence(name, original_bases, "B>%%BB5#")
    expected = dnaio.Sequence(name, expected_bases, "#5BB%%>B")
    assert reverse_complemented_sequence(with_qualities) == expected

    # Record without qualities: only the bases are affected.
    without_qualities = dnaio.Sequence(name, original_bases)
    expected = dnaio.Sequence(name, expected_bases)
    assert reverse_complemented_sequence(without_qualities) == expected
def test_append(tmpdir, fileformat, extension):
    """Check that a record written in mode "a" follows the existing one."""
    first = dnaio.Sequence("s1", "ACGT", "HHHH")
    second = dnaio.Sequence("s2", "CGCA", "8383")
    path = str(tmpdir / ("out." + fileformat + extension))

    # One record per open: create the file first, then append to it.
    for mode, record in (("w", first), ("a", second)):
        with dnaio.open(path, mode=mode) as f:
            f.write(record)

    with xopen(path) as f:
        assert formatted_sequences([first, second], fileformat) == f.read()
def reverse_complemented_sequence(sequence: dnaio.Sequence):
    """Return a new record holding the reverse complement of *sequence*.

    The name is kept, the bases are passed through ``reverse_complement`` and
    the qualities (when present) are reversed so they stay aligned with the
    bases. A record without qualities yields a record without qualities.
    """
    qualities = sequence.qualities
    if qualities is not None:
        qualities = qualities[::-1]
    return dnaio.Sequence(
        sequence.name,
        reverse_complement(sequence.sequence),
        qualities,
    )
def test_write(tmpdir, extension):
    """Write a single record to a ``.fastq`` path and check the file content."""
    record = dnaio.Sequence('name', 'ACGT', 'HHHH')
    out_fastq = tmpdir.join("out.fastq" + extension)

    with dnaio.open(str(out_fastq), mode='w') as writer:
        writer.write(record)

    with xopen(out_fastq) as reader:
        assert reader.read() == '@name\nACGT\n+\nHHHH\n'
def main(args):
    """Cluster UMIs within each DBS cluster and write reads with corrected UMIs.

    UMIs are collected from ``args.abcfile`` via ``get_umis`` and grouped per
    DBS via ``assign_to_dbs`` using ``args.dbsfile``. The UMIs of every DBS are
    then clustered with the UMI-tools ``directional`` method, and one FASTA
    record per read is written to ``args.output`` whose sequence is the first
    UMI of the cluster the read's UMI belongs to. Timings and the module-level
    ``stats`` counters are logged.
    """
    logger.info(f"Filtering reads not of length {args.length} bp.")
    time_start = time.time()

    # Read ABC fasta with UMI sequences and save read name and sequence.
    with dnaio.open(args.abcfile, mode="r") as abc_reader:
        umis = get_umis(abc_reader, length=args.length)

    time_filtered = time.time()
    logger.info(f"Time for filtering: {time_filtered - time_start} s")

    logger.info("Assigning UMIs to DBS clusters")
    with dnaio.open(args.dbsfile, mode="r") as dbs_reader:
        dbs_umis = assign_to_dbs(dbs_reader, umis)

    logger.info(f"DBS clusters linked to ABC: {len(dbs_umis)}")
    time_assign = time.time()
    logger.info(f"Time for assigning clusters: {time_assign - time_filtered} s")

    logger.info("Starting clustering of UMIs within clusters.")

    # Set clustering method
    # Based on https://umi-tools.readthedocs.io/en/latest/API.html
    clusterer = UMIClusterer(cluster_method='directional')

    with dnaio.open(args.output, fileformat="fasta", mode="w") as output:
        for reads_by_umi in dbs_umis.values():
            # Encode each UMI for UMITools and prepare counts
            counts = {}
            for umi, reads in reads_by_umi.items():
                counts[bytes(umi, encoding='utf-8')] = len(reads)
            stats["Total UMIs"] += len(counts)

            # Cluster umis
            clustered_umis = clusterer(counts, threshold=args.threshold)
            stats["Total clustered UMIs"] += len(clustered_umis)

            # Loop over clusters and write reads with corrected UMI.
            for cluster in clustered_umis:
                members = [raw.decode("utf-8") for raw in cluster]
                # The first member of the cluster is used as the corrected UMI.
                canonical_sequence = members[0]
                for member in members:
                    for read_name in reads_by_umi[member]:
                        output.write(dnaio.Sequence(read_name, canonical_sequence))

    time_end = time.time()
    logger.info(f"Time for clustering: {time_end - time_assign} s")
    logger.info(f"Total time to run: {time_end - time_start} s")

    # Send stats to log
    logger.info(f"Reads filtered out: {stats['Reads filtered out']:,}")
    logger.info(f"Reads kept: {stats['Reads kept']}")
    logger.info(f"Total UMIs: {stats['Total UMIs']}")
    logger.info(f"Total clustered UMIs: {stats['Total clustered UMIs']}")
def test_write_interleaved(tmpdir, fileformat, extension):
    """Write two pairs to one interleaved file and check the record order."""
    first_reads = [
        dnaio.Sequence("s1", "ACGT", "HHHH"),
        dnaio.Sequence("s2", "CGCA", "8383"),
    ]
    second_reads = [
        dnaio.Sequence("t1", "TCGT", "5HHH"),
        dnaio.Sequence("t2", "TGCA", "5383"),
    ]
    path = str(tmpdir / ("out.interleaved." + fileformat + extension))

    with dnaio.open(path, interleaved=True, fileformat=fileformat, mode="w") as f:
        for rec1, rec2 in zip(first_reads, second_reads):
            f.write(rec1, rec2)

    # The expected file alternates between the two reads of each pair.
    expected = [rec for pair in zip(first_reads, second_reads) for rec in pair]
    with xopen(path) as f:
        assert formatted_sequences(expected, fileformat) == f.read()
def test_write_paired(tmpdir, fileformat, extension):
    """Write two pairs to two files and check each file holds its own reads."""
    first_reads = [
        dnaio.Sequence("s1", "ACGT", "HHHH"),
        dnaio.Sequence("s2", "CGCA", "8383"),
    ]
    second_reads = [
        dnaio.Sequence("t1", "TCGT", "5HHH"),
        dnaio.Sequence("t2", "TGCA", "5383"),
    ]
    path1 = str(tmpdir / ("out.1." + fileformat + extension))
    path2 = str(tmpdir / ("out.2." + fileformat + extension))

    with dnaio.open(path1, file2=path2, fileformat=fileformat, mode="w") as f:
        for rec1, rec2 in zip(first_reads, second_reads):
            f.write(rec1, rec2)

    # Each output file must contain only the records of its side of the pair.
    for path, records in ((path1, first_reads), (path2, second_reads)):
        with xopen(path) as f:
            assert formatted_sequences(records, fileformat) == f.read()
def test_write_pathlib(tmpdir, fileformat, extension):
    """Check that ``dnaio.open`` accepts a ``pathlib.Path`` as output path."""
    record = dnaio.Sequence("s1", "ACGT", "HHHH")
    path = Path(str(tmpdir / ("out." + fileformat + extension)))

    with dnaio.open(path, mode="w") as f:
        f.write(record)

    expected = b">s1\nACGT\n" if fileformat == "fasta" else b"@s1\nACGT\n+\nHHHH\n"
    with xopen(path, "rb") as f:
        assert f.read() == expected
def test_write_with_xopen(tmpdir, fileformat, extension):
    """Write through an already-open binary ``xopen`` handle and check the file."""
    record = dnaio.Sequence('name', 'ACGT', 'HHHH')
    out_fastq = str(tmpdir.join("out." + fileformat + extension))

    # dnaio gets the open file object, so the format is passed explicitly.
    with xopen(out_fastq, 'wb') as outer_f, \
            dnaio.open(outer_f, mode='w', fileformat=fileformat) as f:
        f.write(record)

    if fileformat == "fasta":
        expected = ">name\nACGT\n"
    else:
        expected = "@name\nACGT\n+\nHHHH\n"
    with xopen(out_fastq) as f:
        assert f.read() == expected
def generate_modified_fastq(read1_file,
                            read2_file,
                            cb_file,
                            read1_coords,
                            modified_read_file,
                            num_mismatches=1,
                            num_n_threshold=3):
    """Match cell barcodes and generate a modified fastq file.

    Every read pair from ``read1_file``/``read2_file`` is passed to
    ``match_cell_barcodes``. For each pair with a truthy result, read 2 is
    written to ``modified_read_file`` under a new name: the first
    space-separated token of the returned read name followed by
    ``RI:Z:<read1 sequence>#<barcode>#<distance>``.

    Returns:
        A tuple ``(modified_read_file, [pairs_written, pairs_seen])``.
    """
    # Only the part of each barcode line before the first '-' is kept.
    cell_barcodes = [
        line.rstrip().split('-')[0]
        for line in open_by_suffix(cb_file, mode='r')
    ]
    cb_index = create_index(barcodes=cell_barcodes,
                            num_mismatches=num_mismatches)

    num_matched = 0
    num_total = 0
    with dnaio.open(file1=read1_file,
                    file2=read2_file,
                    fileformat='fastq',
                    mode='r') as f, dnaio.open(file1=modified_read_file,
                                               fileformat='fastq',
                                               mode='w') as f_out:
        for read1, read2 in f:
            num_total += 1

            reads = (read1.name, read1.sequence, read1.qualities,
                     read2.sequence, read2.qualities)
            out = match_cell_barcodes(reads=reads,
                                      barcode_index=cb_index,
                                      read_coords=read1_coords,
                                      num_mismatches=num_mismatches,
                                      num_n_threshold=num_n_threshold)
            if not out:
                continue

            num_matched += 1
            read_name, read1_seq, _, read2_seq, read2_qual, bc, dist = out

            read_info = '#'.join([read1_seq, bc, str(dist)])
            short_name = read_name.split(' ')[0]
            new_name = ' '.join([short_name, 'RI:Z:' + read_info])

            f_out.write(dnaio.Sequence(new_name, read2_seq, read2_qual))

    return modified_read_file, [num_matched, num_total]
def test_formatted_sequence():
    """Check the FASTA and FASTQ renderings of a single record."""
    record = dnaio.Sequence("s1", "ACGT", "HHHH")
    expected_by_format = {
        "fasta": ">s1\nACGT\n",
        "fastq": "@s1\nACGT\n+\nHHHH\n",
    }
    for fmt, expected in expected_by_format.items():
        assert expected == formatted_sequence(record, fmt)
import pytest


@pytest.fixture(params=["", ".gz", ".bz2", ".xz"])
def extension(request):
    """Yield each file-name suffix in turn, including none at all."""
    return request.param


@pytest.fixture(params=["fasta", "fastq"])
def fileformat(request):
    """Yield each supported file format name in turn."""
    return request.param


# Two sample records per file format; only the FASTQ ones carry qualities.
SIMPLE_RECORDS = {
    "fasta": [
        dnaio.Sequence("first_sequence", "SEQUENCE1"),
        dnaio.Sequence("second_sequence", "SEQUENCE2"),
    ],
    "fastq": [
        dnaio.Sequence("first_sequence", "SEQUENCE1", ":6;;8<=:<"),
        dnaio.Sequence("second_sequence", "SEQUENCE2", "83<??:(61"),
    ],
}


def formatted_sequence(record, fileformat):
    """Return *record* rendered as text in the given file format.

    ``"fastq"`` gives a four-line FASTQ entry; any other value gives a
    two-line FASTA entry.
    """
    if fileformat != "fastq":
        return ">{}\n{}\n".format(record.name, record.sequence)
    return "@{}\n{}\n+\n{}\n".format(
        record.name, record.sequence, record.qualities)
def main(args):
    """Count barcodes from FASTQ read headers and write them as FASTA.

    Takes a fastq file with barcode sequences in the header and writes a
    barcode fasta file with only unique entries. Barcodes seen in fewer than
    ``args.filter`` reads are left out of the output.

    With ``args.index`` set, the barcodes are grouped by ``reduce_complexity``
    and one ``<index>.fa`` file per group is written into the directory
    ``args.output_fasta``. Otherwise everything goes to a single file named
    ``args.output_fasta``, with ``.fasta`` appended unless the name already
    ends in ``.fa`` or ``.fasta``.
    """
    logger.info(f'Filtering barcodes with less than {args.filter} reads')

    # Reading file and building initial bc dict with read counts
    barcode_counts = defaultdict(int)
    separator = " " if args.space_separation else "_"
    with dnaio.open(args.input_fastq, fileformat="fastq", mode="r") as reader:
        for read in reader:
            # NOTE(review): the first whitespace-delimited token cannot contain
            # a space, so with the " " separator the whole token is used as the
            # barcode -- confirm this is the intended header layout.
            barcode_sequence = read.name.split()[0].split(separator)[-1]
            barcode_counts[barcode_sequence] += 1

    if args.index:
        # Indexing mode: get barcode counts for each index of length=args.index.
        indexed_barcode_count, not_atcg_index = reduce_complexity(
            barcode_counts, index_size=args.index)

        # Make directory to put indexing files in unless already present.
        try:
            os.mkdir(args.output_fasta)
        except FileExistsError:
            # Deliberate: an existing output directory is reused.
            pass

        # Write one file per index
        for index_sequence, index_counts in indexed_barcode_count.items():
            _write_barcode_fasta(f'{args.output_fasta}/{index_sequence}.fa',
                                 index_counts, args.filter)
    else:
        # Non-indexing mode: make sure the file name has a fasta extension.
        if args.output_fasta.endswith(('.fa', '.fasta')):
            output = args.output_fasta
        else:
            output = f'{args.output_fasta}.fasta'

        # Write all output to one file.
        _write_barcode_fasta(output, barcode_counts, args.filter)

    # Reporting
    logger.info(f'Unique BC count in input:\t{len(barcode_counts)}')
    if args.index:
        logger.info(
            f'BC count where N was in index (Omitted from tot. BC count):\t{not_atcg_index}'
        )
    logger.info("Finished")


def _write_barcode_fasta(path, barcode_counts, min_reads):
    """Write the barcodes with at least *min_reads* reads to FASTA file *path*."""
    logger.info(f'Writing output to {path}')
    with dnaio.open(path, fileformat="fasta", mode="w") as openout:
        # Ids are assigned before filtering, so skipped barcodes leave gaps in
        # the numbering.
        for bc_id, (barcode, read_count) in enumerate(barcode_counts.items(),
                                                      start=1):
            if read_count < min_reads:
                continue

            # NOTE(review): the record name itself starts with '>'. If the
            # FASTA writer also emits a header marker, the output line starts
            # with '>>' -- kept as is, confirm against downstream consumers.
            fasta_name = f'>{bc_id}:{read_count}:{barcode}'
            openout.write(dnaio.Sequence(name=fasta_name, sequence=barcode))
def main():
    """Simulate barcode generation, PCR and sequencing, then report results.

    Parses the command line, creates barcodes, runs them through the PCR and
    sequencing-error steps and prints summary counts plus the read-count
    distribution. With ``args.debug`` every barcode collection is dumped.
    With ``args.output`` the sequenced barcodes are written in either
    ``cd-hit`` (FASTA) or ``starcode`` (tab-separated, plus a ``.true`` file
    listing barcodes found in ``true_barcodes``) format.

    When ``args.reads`` is given, the number of PCR cycles is derived from it
    and replaces ``args.pcr_cycles``.
    """
    args = get_arguments()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(levelname)s: %(message)s")

    #
    # Header
    #
    print('*' * WIDTH)
    print('SIMULATE BARCODE GENERATION, PCR AND SEQUENCING.')
    print('*' * WIDTH)
    print('Command line options:')
    if args.reads:
        # Cycles needed to amplify `number` molecules to `reads`, given that
        # each cycle multiplies the count by 2 * efficiency. This is the only
        # place the value is derived; computing it a second time elsewhere
        # with a different formula would be overwritten here and could fail
        # in math.log for small reads/number ratios.
        args.pcr_cycles = math.ceil(
            math.log(args.reads / args.number, 2 * args.pcr_efficency))
        print('Note: Calculating PCR cycles based on reads!')
        print('-' * WIDTH)

    arguments = [f"{a}: {v}" for a, v in vars(args).items()]
    print("\n".join(arguments))
    print('-' * WIDTH)

    #
    # Create barcodes
    #
    start = time.time()
    barcode_options = [translate(base) for base in args.sequence]
    logging.info("Creating barcodes")
    barcodes = create_barcodes_generator(args.number, barcode_options)

    #
    # Run PCR
    #
    start_pcr = time.time()
    logging.info("Running PCR")
    final_barcodes = pcr_cycles(barcodes,
                                efficiency=args.pcr_efficency,
                                pcr_cycles=args.pcr_cycles,
                                error_rate=args.error_rate_pcr,
                                nprocs=args.processes)
    logging.info(f"PCR done, time: {time.time() - start_pcr:.2f} s")
    logging.info(f"PCR errors generated: {counter['pcr errors']:,}")

    #
    # Sequencing
    #
    start_seq = time.time()
    logging.info("Sequencing")
    barcodes_after_seq = add_sequencing_errors(
        final_barcodes, seq_error_rate=args.error_rate_seq)
    logging.info(f"Sequencing done, time: {time.time() - start_seq:.2f} s")
    logging.info(
        f"Sequencing errors generated: {counter['sequencing errors']:,}")

    #
    # Output
    #
    print('-' * WIDTH)
    print('Results')
    print('-' * WIDTH)
    print(
        f"Number of barcodes before sequencing: {counter['Barcodes start']:7,}"
    )
    print(
        f"Number of uniq barcodes before sequencing: {len(true_barcodes):7,}")
    print(
        f"Number of barcodes after PCR: {len(final_barcodes):7,}")
    print(
        f"Number of barcode molecules after PCR: {sum(final_barcodes.values()):7,}"
    )
    print(
        f"Number of barcodes after sequencing: {len(barcodes_after_seq):7,}"
    )
    print(
        f"Number of barcode reads after sequencing: {sum(barcodes_after_seq.values()):7,}"
    )
    print('-' * WIDTH)
    logging.info(f"Total run time: {time.time() - start:.2f} s")

    # (read count, number of barcodes with that read count), ascending.
    distribution = sorted(
        collections.Counter(barcodes_after_seq.values()).items())
    print("Freq\tReads\tRatio")
    for reads, freq in distribution:
        # Ratio is relative to the frequency of the lowest read count.
        print(f"{freq}\t{reads}\t{freq/distribution[0][1]:.5f}")
        # Only the low end of the distribution is reported.
        if reads >= 10:
            break

    if args.debug:
        for p in sorted(collections.Counter(barcodes).items()):
            print(p)
        print()
        for p in sorted(final_barcodes.items()):
            print(p)
        print()
        for p in sorted(barcodes_after_seq.items()):
            print(p)

    if args.output:
        if args.output_format == 'cd-hit':
            with dnaio.open(args.output, mode='w',
                            fileformat='fasta') as writer:
                for nr, (barcode, count) in enumerate(
                        barcodes_after_seq.items()):
                    # Leading flag marks whether the barcode is an original one.
                    is_true = 1 if barcode in true_barcodes else 0
                    record = dnaio.Sequence(
                        f"{is_true}:{nr}:{count}:{barcode}", barcode)
                    writer.write(record)

        if args.output_format == "starcode":
            with open(args.output, 'w') as writer, \
                    open(f"{args.output}.true", 'w') as true_writer:
                for barcode, count in barcodes_after_seq.items():
                    if barcode in true_barcodes:
                        print(barcode, file=true_writer)
                    print(f"{barcode}\t{count}", file=writer)