def test_sequence_reader(self):
    """Check format autodetection of ``dnaio.open`` for paths and file objects."""
    fastq_path = "tests/data/simple.fastq"
    fasta_path = "tests/data/simple.fasta"
    expectations = ((fastq_path, simple_fastq), (fasta_path, simple_fasta))

    # test the autodetection when a path is given
    for path, expected in expectations:
        with dnaio.open(path) as reader:
            records = list(reader)
        assert records == expected

    # an already opened binary file must work as well
    with open(fastq_path, 'rb') as handle:
        records = list(dnaio.open(handle))
    assert records == simple_fastq

    # make the name attribute unavailable by going through a BytesIO
    for path, expected in expectations:
        with open(path, 'rb') as handle:
            content = handle.read()
        stream = BytesIO(content)
        records = list(dnaio.open(stream))
        assert records == expected
def __call__(self, read1, read2, matches1, matches2):
    """
    Route a read pair to output files chosen by the most recent adapter
    match on R1 and on R2.

    A writer pair is opened on first use for every (R1 name, R2 name)
    combination and cached in ``self.writers``. A side without matches is
    filed under ``self.untrimmed_name``; if that is None, the pair is not
    written at all. ``DISCARD`` is returned in every case.
    """
    assert read2 is not None
    adapter1 = matches1[-1].adapter.name if matches1 else None
    adapter2 = matches2[-1].adapter.name if matches2 else None
    key = (adapter1, adapter2)

    if key not in self.writers:
        # Substitute the "untrimmed" label for a side that has no match
        label1 = self.untrimmed_name if adapter1 is None else adapter1
        label2 = self.untrimmed_name if adapter2 is None else adapter2
        if label1 is None or label2 is None:
            return DISCARD
        first_path = self._make_path(self.template, label1, label2)
        second_path = self._make_path(self.paired_template, label1, label2)
        first_writer = dnaio.open(first_path, mode='w', qualities=self.qualities)
        second_writer = dnaio.open(second_path, mode='w', qualities=self.qualities)
        self.writers[key] = (first_writer, second_writer)

    out1, out2 = self.writers[key]
    self.written += 1
    self.written_bp[0] += len(read1)
    self.written_bp[1] += len(read2)
    out1.write(read1)
    out2.write(read2)
    return DISCARD
def test_autodetect_fastq_format(self):
    """A ``.fastq`` output path must select FastqWriter and round-trip the records."""
    fastq_path = os.path.join(self._tmpdir, 'tmp.fastq')
    with dnaio.open(fastq_path, mode='w') as writer:
        assert isinstance(writer, FastqWriter)
        for record in simple_fastq:
            writer.write(record)

    with dnaio.open(fastq_path) as reader:
        records = list(reader)
    assert records == simple_fastq
def test_write_qualities_to_fasta(self):
    """With a ``.fasta`` path, ``qualities=True`` must still give a FastaWriter.

    The FASTQ records are written through it and reading the file back has
    to yield the FASTA version of the records.
    """
    fasta_path = os.path.join(self._tmpdir, 'tmp.fasta')
    with dnaio.open(fasta_path, mode='w', qualities=True) as writer:
        assert isinstance(writer, FastaWriter)
        for record in simple_fastq:
            writer.write(record)

    with dnaio.open(fasta_path) as reader:
        records = list(reader)
    assert records == simple_fasta
def _consume_reads(reads):
    """Iterate over all of *reads* once; return True if any record was seen.

    Running to the end forces every record to be parsed, so a format error
    anywhere in the input is raised from here.
    """
    found = False
    for _ in reads:
        found = True
    return found


def main():
    """Invoke when run directly as a program.

    Validates the FASTQ files given on the command line: every file must
    exist, carry a supported extension and be listed only once; mate-pair
    lists must have equal length; and every file (or mate pair) must parse
    as FASTQ and contain at least one read. Problems are reported through
    ``set_error``.
    """
    args = parse_arguments()

    input_fastq = (args.fq + args.fq2) if args.fq2 else args.fq

    for fq_file in input_fastq:
        if not isfile(fq_file):
            set_error('Input file {} does not exist'.format(basename(fq_file)))
        if not fq_file.lower().endswith(SUPPORTED_EXTENSIONS):
            set_error('Unrecognized file name extension in file {}. '
                      'Supported file name extensions are {}.'.format(fq_file, SUPPORTED_EXTENSIONS))

    # Reduce the probability of uploading the FASTQ files with the same
    # content multiple times (as multiple lanes or mates).
    if len(set(input_fastq)) != len(input_fastq):
        seen_files = [item for item, count in collections.Counter(input_fastq).items() if count > 1]
        set_error('Non-unique input file names detected: {}.'.format(seen_files))

    if args.fq2 and len(args.fq) != len(args.fq2):
        set_error('The number of mate-pair files in split-lane samples must match. '
                  '{} and {} input files were given for the -fq and -fq2 inputs, '
                  'respectively.'.format(len(args.fq), len(args.fq2)))

    if args.fq2:
        for mate1, mate2 in zip(args.fq, args.fq2):
            try:
                # The context manager closes both files even if parsing fails.
                with dnaio.open(mate1, file2=mate2, fileformat='fastq') as paired_reads:
                    has_reads = _consume_reads(paired_reads)
                if not has_reads:
                    set_error(
                        'Mate-pair files {} and {} contain no read sequences.'.format(
                            basename(mate1), basename(mate2))
                    )
                else:
                    print('Successfully validated mate-pair files {} and {}.'.format(
                        basename(mate1), basename(mate2)))
            except (FastqFormatError, FileFormatError) as dnaio_error:
                set_error(
                    'Format error in mate-pairs {} and {}. {}'.format(
                        basename(mate1), basename(mate2), str(dnaio_error))
                )
    else:
        for fq in args.fq:
            try:
                with dnaio.open(fq, fileformat='fastq') as reads:
                    has_reads = _consume_reads(reads)
                if not has_reads:
                    set_error('Input file {} contains no read sequences.'.format(basename(fq)))
                else:
                    print('Successfully validated reads file {}.'.format(basename(fq)))
            except (FastqFormatError, FileFormatError) as dnaio_error:
                set_error('Error in file {}. {}'.format(basename(fq), str(dnaio_error)))
def test_autodetect_fasta_format(self, tmpdir):
    """A ``.fasta`` output path must select FastaWriter and round-trip the records."""
    fasta_path = str(tmpdir.join('tmp.fasta'))
    with dnaio.open(fasta_path, mode='w') as writer:
        assert isinstance(writer, FastaWriter)
        for record in simple_fastq:
            writer.write(record)

    with dnaio.open(fasta_path) as reader:
        read_back = list(reader)
    assert read_back == simple_fasta
def main(args):
    """Cluster UMIs within each DBS cluster and write reads with the corrected UMI.

    Steps: collect UMIs from ``args.abcfile`` via ``get_umis``, group them
    per DBS cluster via ``assign_to_dbs`` using ``args.dbsfile``, cluster the
    UMIs of each group with ``UMIClusterer`` and write one FASTA record per
    read to ``args.output`` carrying the first sequence of its UMI cluster.
    Timings and the counters in ``stats`` are logged.
    """
    logger.info(f"Filtering reads not of length {args.length} bp.")

    started = time.time()

    # Read ABC fasta with UMI sequences and save read name and sequence.
    with dnaio.open(args.abcfile, mode="r") as abc_reader:
        umis = get_umis(abc_reader, length=args.length)

    filtered_at = time.time()
    logger.info(f"Time for filtering: {filtered_at - started} s")

    logger.info("Assigning UMIs to DBS clusters")
    with dnaio.open(args.dbsfile, mode="r") as dbs_reader:
        dbs_umis = assign_to_dbs(dbs_reader, umis)

    logger.info(f"DBS clusters linked to ABC: {len(dbs_umis)}")

    assigned_at = time.time()
    logger.info(f"Time for assigning clusters: {assigned_at - filtered_at} s")

    logger.info("Starting clustering of UMIs within clusters.")

    # Set clustering method
    # Based on https://umi-tools.readthedocs.io/en/latest/API.html
    clusterer = UMIClusterer(cluster_method='directional')

    with dnaio.open(args.output, fileformat="fasta", mode="w") as output:
        for reads_by_umi in dbs_umis.values():
            # Encode each UMI for UMITools and prepare counts
            counts = {
                bytes(umi, encoding='utf-8'): len(reads)
                for umi, reads in reads_by_umi.items()
            }
            stats["Total UMIs"] += len(counts)

            # Cluster umis
            clustered_umis = clusterer(counts, threshold=args.threshold)
            stats["Total clustered UMIs"] += len(clustered_umis)

            # Loop over clusters and write reads with corrected UMI.
            for cluster in clustered_umis:
                members = [raw.decode("utf-8") for raw in cluster]
                # The first member of the cluster replaces all others
                canonical = members[0]
                for member in members:
                    for read_name in reads_by_umi[member]:
                        output.write(dnaio.Sequence(read_name, canonical))

    finished_at = time.time()
    logger.info(f"Time for clustering: {finished_at - assigned_at} s")
    logger.info(f"Total time to run: {finished_at - started} s")

    # Send stats to log
    logger.info(f"Reads filtered out: {stats['Reads filtered out']:,}")
    logger.info(f"Reads kept: {stats['Reads kept']}")
    logger.info(f"Total UMIs: {stats['Total UMIs']}")
    logger.info(f"Total clustered UMIs: {stats['Total clustered UMIs']}")
def split_fastq_reads(fastq_path, output_path, trim_b=0, size_l=40, size_r=40, size_m=30):
    """
    Split every read of a FASTQ file into parts for remapping.

    Depending on its length (after optional trimming) a read is either
    skipped, split into a left and a right part, or split into left, right
    and middle parts. The parts are written to ``output_path`` with a
    suffix appended to the read name:

    - left part: the first ``size_l`` bases, suffix ``-l``
    - right part: the last ``size_r`` bases, suffix ``-r``
    - middle part: whatever lies between the two, suffix ``-m``

    Parameters
    ----------
    fastq_path
        Input file, opened with ``dnaio.open``.
    output_path
        Output file, opened with ``dnaio.open`` in write mode.
    trim_b
        Number of bases removed from both ends of each read before
        splitting; converted with ``int()``. No trimming if not positive.
    size_l
        Length of the left part.
    size_r
        Length of the right part.
    size_m
        The middle part is written only if the read is at least
        ``size_l + size_r + size_m`` long.

    Returns
    -------
    None
    """
    trim_b = int(trim_b)
    # Reads not longer than the larger of the two outer parts are dropped
    skip_up_to = max(size_l, size_r)

    with dnaio.open(fastq_path) as reader, dnaio.open(output_path, mode='w') as writer:

        def emit(part, suffix):
            part.name += suffix
            writer.write(part)

        for read in reader:
            if trim_b > 0:
                read = read[trim_b:-trim_b]
            read_length = len(read)
            if read_length <= skip_up_to:
                continue

            # left and right parts may overlap
            emit(read[:size_l], '-l')
            emit(read[-size_r:], '-r')

            # the middle part is used as well if it is long enough
            if read_length >= (size_l + size_r + size_m):
                emit(read[size_l:-size_r], '-m')
def test_autodetect_fastq_weird_name(self):
    """Reading must detect FASTQ even when the file name gives no hint.

    The file is written as ``tmp.fastq.gz`` and then renamed to
    ``tmp.weird.gz`` before it is read back.
    """
    original_path = os.path.join(self._tmpdir, 'tmp.fastq.gz')
    with dnaio.open(original_path, mode='w') as writer:
        assert isinstance(writer, FastqWriter)
        for record in simple_fastq:
            writer.write(record)

    renamed_path = os.path.join(self._tmpdir, 'tmp.weird.gz')
    os.rename(original_path, renamed_path)

    with dnaio.open(renamed_path) as reader:
        records = list(reader)
    assert records == simple_fastq
def test_append(tmp_path, fileformat, extension):
    """Opening with ``mode="a"`` must add records after the existing ones."""
    first = dnaio.SequenceRecord("s1", "ACGT", "HHHH")
    second = dnaio.SequenceRecord("s2", "CGCA", "8383")
    path = tmp_path / ("out." + fileformat + extension)

    # Create the file with the first record, then append the second
    for mode, record in (("w", first), ("a", second)):
        with dnaio.open(path, mode=mode) as writer:
            writer.write(record)

    with xopen(path) as f:
        content = f.read()
    assert formatted_sequences([first, second], fileformat) == content
def main(args):
    """Move the first 20 bases of read 1 into the names of both reads of a pair.

    Input is interleaved FASTQ unless ``args.input2`` is given. Output goes
    to ``args.output1``/``args.output2``; without ``args.output1`` it is
    written interleaved to stdout. For every pair, the leading 20 bases
    (and qualities) are removed from read 1 and appended, separated by
    ``_``, to the first whitespace-delimited field of both read names.

    Raises:
        ValueError: if a read name has no whitespace-separated second field.
    """
    logger.info('Starting')

    progress = BLR.ProgressReporter('Read pairs processed', 1000000)

    input_interleaved = not args.input2
    logger.info(
        f"Input detected as {'interleaved fastq.' if input_interleaved else 'paired fastq.'}"
    )

    # If no output1 is given output is sent to stdout
    if not args.output1:
        logger.info("Writing output to stdout.")
        args.output1 = sys.stdout.buffer
        args.output2 = None

    output_interleaved = not args.output2
    logger.info(
        f"Output detected as {'interleaved fastq.' if output_interleaved else 'paired fastq.'}"
    )

    # Context managers make sure both files are closed even when a record
    # cannot be processed.
    with dnaio.open(args.input1, file2=args.input2, interleaved=input_interleaved,
                    mode="r", fileformat="fastq") as reader, \
            dnaio.open(args.output1, file2=args.output2, interleaved=output_interleaved,
                       mode="w", fileformat="fastq") as writer:
        for read1, read2 in reader:
            # Adjusting for BC
            bc_seq = read1.sequence[:20]
            read1.sequence = read1.sequence[20:]
            read1.qualities = read1.qualities[20:]

            # Header parsing
            name_and_pos_r1, read_and_index_r1 = read1.name.split(maxsplit=1)
            name_and_pos_r2, read_and_index_r2 = read2.name.split(maxsplit=1)

            # Save header to read instances
            read1.name = name_and_pos_r1 + '_' + bc_seq + ' ' + read_and_index_r1
            read2.name = name_and_pos_r2 + '_' + bc_seq + ' ' + read_and_index_r2

            # Write to out
            writer.write(read1, read2)

            # Progress reporting
            progress.update()

    logger.info('Finished')
def main(args):
    """Optionally rename and reorder the records of ``args.target``; print them as FASTA.

    - ``args.rename_from``: records whose sequence is found in this template
      file get the template's name; others get ``args.not_found`` appended.
    - ``args.order_by``: records are ordered via ``sorted_by_gene`` using the
      gene names of this file; otherwise ``args.sort`` requests a natural
      sort by name.
    """
    template = None
    if args.rename_from:
        with dnaio.open(args.rename_from) as fr:
            template = PrefixDict([])
            for entry in fr:
                sequence = entry.sequence.upper()
                try:
                    template.add(sequence, entry.name)
                except ValueError:
                    logger.error('Sequences in entry %r and %r are duplicate',
                                 entry.name, template[entry.sequence.upper()])
        logger.info('Read %d entries from template', len(template))

    gene_order = None
    if args.order_by:
        with dnaio.open(args.order_by) as fr:
            gene_order = [gene_name(r) for r in fr]

    with dnaio.open(args.target) as fr:
        sequences = list(fr)

    # Rename
    renamed = 0
    if template is not None:
        for record in sequences:
            new_name = template.get(record.sequence.upper())
            if new_name is None:
                new_name = record.name + args.not_found
            else:
                renamed += 1
            # Replace record’s name, leaving comment intact
            comment = record.name.partition(' ')[2]
            record.name = new_name + ' ' + comment if comment else new_name

    # Reorder
    if gene_order:
        try:
            sequences = sorted_by_gene(sequences, gene_order)
        except GeneMissing as e:
            logger.error('Gene "%s" not found in the --order-by template file', e)
            sys.exit(1)
    elif args.sort:
        sequences = sorted(sequences, key=lambda s: natural_sort_key(s.name))

    for record in sequences:
        print('>{}\n{}'.format(record.name, record.sequence))
    logger.info('Wrote %s FASTA records (%d sequences found in template)',
                len(sequences), renamed)
def _open_raise_limit(path, qualities):
    """
    Open a FASTA/FASTQ file for writing.

    Should opening fail with EMFILE (too many open files), the limit is
    raised through ``raise_open_files_limit(8)`` and opening is attempted
    once more. Any other OSError is passed on unchanged.
    """
    try:
        return dnaio.open(path, mode="w", qualities=qualities)
    except OSError as e:
        if e.errno != errno.EMFILE:
            raise
        # Too many open files
        raise_open_files_limit(8)
        return dnaio.open(path, mode="w", qualities=qualities)
def generate_modified_fastq(read1_file, read2_file, cb_file, read1_coords, modified_read_file,
                            num_mismatches=1, num_n_threshold=3):
    """Match cell barcodes and write a modified FASTQ file.

    Barcodes are read from ``cb_file`` (text before the first ``-`` of each
    line). Each read pair is passed to ``match_cell_barcodes``; for pairs
    that match, read 2 is written to ``modified_read_file`` under a name made
    of the first space-separated field of the original name and an
    ``RI:Z:`` tag holding ``read1_seq#barcode#distance``.

    Returns:
        tuple: ``(modified_read_file, [matched pairs, total pairs])``.
    """
    cell_barcodes = [
        line.rstrip().split('-')[0]
        for line in open_by_suffix(cb_file, mode='r')
    ]
    cb_index = create_index(barcodes=cell_barcodes, num_mismatches=num_mismatches)

    # [number of pairs with a barcode match, number of pairs seen]
    read_counter = [0, 0]

    with dnaio.open(file1=read1_file, file2=read2_file, fileformat='fastq', mode='r') as f, \
            dnaio.open(file1=modified_read_file, fileformat='fastq', mode='w') as f_out:
        for rec in f:
            read_counter[1] += 1
            read1, read2 = rec
            out = match_cell_barcodes(
                reads=(read1.name, read1.sequence, read1.qualities,
                       read2.sequence, read2.qualities),
                barcode_index=cb_index,
                read_coords=read1_coords,
                num_mismatches=num_mismatches,
                num_n_threshold=num_n_threshold,
            )
            if not out:
                continue

            read_counter[0] += 1
            read_name, read1_seq, _, read2_seq, read2_qual, bc, dist = out
            read_info = '#'.join([read1_seq, bc, str(dist)])
            tagged_name = ' '.join([read_name.split(' ')[0], 'RI:Z:' + read_info])
            f_out.write(dnaio.Sequence(tagged_name, read2_seq, read2_qual))

    return modified_read_file, read_counter
def length_histogram(path):
    """Return a list with the sequence length of every record in *path*."""
    with dnaio.open(path) as reader:
        return [len(record.sequence) for record in reader]
def get_sequence(read1_file, read2_file, read1_coords=read1_coords, read2_coords=read2_coords):
    """Yield sub-sequences and full sequences for every pair of a paired FASTQ input.

    For each read pair a tuple ``(r1, r2, read1_seq, read2_seq)`` is yielded.
    ``r1``/``r2`` are the parts of the reads selected by the
    ``(start, end)`` coordinates, with ``end`` capped at the read length, or
    the full sequences when the coordinates are falsy.
    """
    def window(sequence, coords):
        if not coords:
            return sequence
        start, end = coords
        return sequence[start: min(end, len(sequence))]

    with dnaio.open(file1=read1_file, file2=read2_file, fileformat='fastq', mode='r') as f:
        for read1, read2 in f:
            read1_seq = read1.sequence
            read2_seq = read2.sequence
            r1 = window(read1_seq, read1_coords)
            r2 = window(read2_seq, read2_coords)
            yield r1, r2, read1_seq, read2_seq
def main():
    """Cluster barcodes from a FASTA file and print a histogram of cluster sizes.

    Record names are expected to look like ``id:count:sequence``; the count
    of each sequence is passed to ``UMIClusterer`` (directional method,
    threshold 1).
    """
    args = get_arguments()

    bc_dict = {}
    with dnaio.open(args.fasta, mode="r") as reader:
        for read in reader:
            _, bc_count, bc_seq = read.name.strip('>').split(':')
            bc_dict[bc_seq] = int(bc_count)

    # Based on https://umi-tools.readthedocs.io/en/latest/API.html
    # NOTE(review): keys are str here; confirm UMIClusterer accepts them
    clusterer = UMIClusterer(cluster_method='directional')

    start = time.time()
    clustered_bcs = clusterer(bc_dict, threshold=1)
    end = time.time()

    size_frequencies = sorted(Counter(len(cluster) for cluster in clustered_bcs).items())

    print("Cluster size, Frequency")
    for size, frequency in size_frequencies:
        print(f"{size:12}, {frequency:9}")

    print(f'Time to run: {end-start} s')
    print(f'Length data: {len(bc_dict)}')
def _open_writer(self, file, file2=None, force_fasta=None, **kwargs):
    """Open *file* (and optionally *file2*) for writing through ``dnaio.open``.

    ``qualities`` is taken from ``self.uses_qualities``. A truthy
    *force_fasta* sets ``fileformat='fasta'``, overriding any value given in
    *kwargs*; all other keyword arguments are forwarded unchanged.
    """
    # TODO backwards-incompatible change (?) would be to use outfiles.interleaved
    # for all outputs
    open_kwargs = dict(kwargs)
    if force_fasta:
        open_kwargs['fileformat'] = 'fasta'
    return dnaio.open(file, file2=file2, mode='w', qualities=self.uses_qualities, **open_kwargs)
def build_index(self, size=None):
    """Build a dictionary indexing the read pairs of ``self.r1``/``self.r2``.

    This is a generator. The yielded dictionary maps the ``identifier`` of a
    ``ReadPair`` to a 2-tuple ``(read1 sequence, read2 sequence)``.

    Args:
        size: If truthy, the dictionary is yielded whenever it holds at
            least this many entries and is cleared afterwards, so the
            caller receives the index in chunks. The same dictionary object
            is reused for every chunk. If falsy, a single dictionary with
            all pairs is yielded.

    Yields:
        dict: The (partial) index; the last one may be empty.
    """
    index = dict()
    indexed = 0
    with dnaio.open(self.r1, file2=self.r2) as pairs:
        for read1, read2 in pairs:
            pair = ReadPair(read1, read2)
            ident = pair.identifier
            index[ident] = (pair.read1.sequence, pair.read2.sequence)
            indexed += 1
            if not size or len(index) < size:
                continue
            print("%d reads indexed" % indexed)
            indexed = 0
            yield index
            # Clear the dictionary so that it will take the next chunk.
            index.clear()
            gc.collect()
    print("%d reads indexed" % indexed)
    yield index
def _read_fasta(path):
    """Return all records of *path*, with names cut at the first whitespace."""
    shortened = []
    with dnaio.open(path) as reader:
        for entry in reader:
            # Keep only the identifier, dropping any comment after it
            identifier = entry.name.split(maxsplit=1)[0]
            entry.name = identifier
            shortened.append(entry)
    return shortened
def test_write(tmpdir, extension):
    """A single record written through ``dnaio.open`` must come out as four FASTQ lines."""
    record = dnaio.Sequence('name', 'ACGT', 'HHHH')
    out_fastq = tmpdir.join("out.fastq" + extension)

    with dnaio.open(str(out_fastq), mode='w') as writer:
        writer.write(record)

    with xopen(out_fastq) as f:
        written = f.read()
    assert written == '@name\nACGT\n+\nHHHH\n'
def test_non_ascii_in_record(self):
    """A non-ASCII byte in the sequence line must raise FastqFormatError."""
    # \xc4 -> Ä
    data = BytesIO(b'@r1\n\xc4\n+\nH')
    # The error message has to mention "Non-ASCII"
    with pytest.raises(FastqFormatError, match="Non-ASCII"):
        with dnaio.open(data) as reader:
            list(reader)
def validate_fasta(path):
    """
    Ensure that the FASTA file is suitable for use with makeblastdb.

    All records are read first and then checked in file order. A
    FastaValidationError is raised at the first record for which any of the
    following is true:

    - the record is empty
    - its name occurred before
    - its sequence (compared case-insensitively) occurred before
    """
    with dnaio.open(path) as reader:
        records = list(reader)

    seen_names = set()
    name_of_sequence = dict()
    for record in records:
        name = record.name
        if len(record.sequence) == 0:
            raise FastaValidationError("Record {!r} is empty".format(name))
        if name in seen_names:
            raise FastaValidationError("Record name {!r} occurs more than once".format(name))
        normalized = record.sequence.upper()
        if normalized in name_of_sequence:
            raise FastaValidationError("Records {!r} and {!r} contain the same sequence".format(
                name, name_of_sequence[normalized]))
        name_of_sequence[normalized] = name
        seen_names.add(name)
def main(args):
    """Plot a dendrogram of the sequences in ``args.fasta`` to ``args.plot``.

    If ``args.mark`` is given, sequences not found in that file get the
    label suffix `` (new)``. Distances come from ``distances``; the linkage
    uses ``args.method``. With fewer than two sequences, a placeholder text
    is drawn instead.
    """
    with dnaio.open(args.fasta) as fr:
        sequences = list(fr)
    logger.info('Plotting dendrogram of %s sequences', len(sequences))

    if args.mark:
        with dnaio.open(args.mark) as fr:
            known = PrefixComparer(record.sequence for record in fr)
        labels = []
        n_new = 0
        for record in sequences:
            is_new = record.sequence not in known
            if is_new:
                n_new += 1
            labels.append(record.name + (' (new)' if is_new else ''))
        logger.info('%s sequence(s) marked as "new"', n_new)
    else:
        labels = [s.name for s in sequences]

    import seaborn as sns
    import matplotlib.pyplot as plt
    sns.set()
    sns.set_style("white")

    # Scale the font with the number of labels, clamped to 6..16
    n_rows = len(labels) + 5
    font_size = 297 / 25.4 * 72 / n_rows
    font_size = min(16, max(6, font_size))
    height = font_size * n_rows / 72
    fig = plt.figure(figsize=(210 / 25.4, height))
    matplotlib.rcParams.update({'font.size': 4})
    ax = fig.gca()
    sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)
    sns.set_style('whitegrid')

    if len(sequences) >= 2:
        matrix = distances([s.sequence for s in sequences])
        condensed = distance.squareform(matrix)
        smallest = condensed.min()
        mindist = int(smallest)
        logger.info('Smallest distance is %s. Found between:', mindist)
        for i, j in np.argwhere(matrix == smallest):
            # Report each pair only once
            if i < j:
                logger.info('%s and %s', labels[i], labels[j])
        linkage = hierarchy.linkage(condensed, method=args.method)
        hierarchy.dendrogram(linkage, labels=labels, leaf_font_size=font_size,
                             orientation='right',
                             color_threshold=0.95 * max(linkage[:, 2]))
    else:
        ax.text(0.5, 0.5, 'no sequences', fontsize='xx-large')

    ax.grid(False)
    fig.set_tight_layout(True)
    fig.savefig(args.plot)
def main(args):
    """Dereplicate the sequences in ``args.fastx`` and print them as FASTA.

    At most ``args.limit`` records are read; records shorter than
    ``args.minimum_length`` are skipped. Records with identical sequences
    are grouped and the first one of each group is printed with a
    ``size=`` annotation. ``args.barcode_length`` bases are split off as a
    barcode, from the start if non-negative and from the end if negative.
    With ``args.trim_g``, leading G bases are removed after barcode removal.
    If ``args.json`` is set, the number of written groups is saved there.
    """
    barcode_length = args.barcode_length
    too_short = 0
    n = 0
    # maps sequences to a list of Sequence objects containing them
    sequences = defaultdict(list)
    with dnaio.open(args.fastx) as f:
        for record in islice(f, 0, args.limit):
            n += 1
            if len(record) < args.minimum_length:
                too_short += 1
                continue
            sequences[record.sequence].append(record)

    n_written = 0
    for records in sequences.values():
        # If there are multiple records with the same sequence, pick the first
        record = records[0]
        if barcode_length >= 0:
            barcode = record.sequence[:barcode_length]
            unbarcoded = record[barcode_length:]
        else:
            barcode = record.sequence[barcode_length:]
            unbarcoded = record[:barcode_length]

        if args.trim_g:
            # The RACE protocol leads to a run of non-template Gs in the beginning
            # of the sequence, after the barcode.
            untrimmed_length = len(unbarcoded.sequence)
            unbarcoded.sequence = unbarcoded.sequence.lstrip('G')
            if unbarcoded.qualities:
                # Cut as many qualities as bases were removed. Slicing with
                # [-len(sequence):] would keep everything when the sequence
                # became empty, since [-0:] selects the whole string.
                n_removed = untrimmed_length - len(unbarcoded.sequence)
                unbarcoded.qualities = unbarcoded.qualities[n_removed:]

        name = record.name.split(maxsplit=1)[0]
        if name.endswith(';'):
            name = name[:-1]
        if barcode_length:
            print('>{};barcode={};size={};\n{}'.format(
                name, barcode, len(records), unbarcoded.sequence))
        else:
            print('>{};size={};\n{}'.format(name, len(records), unbarcoded.sequence))
        n_written += 1

    logger.info('%s sequences processed', n)
    logger.info('%s sequences long enough', n - too_short)
    logger.info('%s dereplicated sequences written', n_written)
    if args.json:
        stats = {
            'groups_written': n_written,
        }
        with open(args.json, 'w') as f:
            json.dump(stats, f, indent=2)
            print(file=f)
def _open_writer(self, file, file2, **kwargs):
    """Open *file* and *file2* for writing through ``dnaio.open``.

    ``qualities`` is taken from ``self.uses_qualities``; any further
    keyword arguments are forwarded unchanged.
    """
    # TODO backwards-incompatible change (?) would be to use outfiles.interleaved
    # for all outputs
    writer = dnaio.open(
        file,
        file2=file2,
        mode='w',
        qualities=self.uses_qualities,
        **kwargs
    )
    return writer
def main(args):
    """Run IgBLAST on the sequences in ``args.fasta`` and write a table to stdout.

    - The cache setting comes from ``GlobalConfig`` unless ``args.cache``
      overrides it; when enabled, the module-level ``_igblastcache`` is set.
    - ``args.threads == 0`` is replaced by ``available_cpu_count()``.
    - At most ``args.limit`` sequences are processed.
    - ``args.raw``, if given, receives the raw IgBLAST output.
    - ``args.rename``, if given, replaces query names by ``<rename>seq<n>``.
    - ``args.stats``, if given, receives a JSON summary.

    The process exits with status 1 if stdout is closed early (EPIPE).
    """
    global _igblastcache

    config = GlobalConfig()
    use_cache = config.use_cache
    if args.cache is not None:
        use_cache = args.cache
    if use_cache:
        _igblastcache = IgBlastCache()
        logger.info('IgBLAST cache enabled')
    if args.threads == 0:
        args.threads = available_cpu_count()

    logger.info("Running IgBLAST on database sequences to find CDR/FR region locations")
    database = Database(args.database, args.sequence_type)

    logger.info("Running IgBLAST on input reads")
    detected_cdr3s = 0
    writer = TableWriter(sys.stdout)
    start_time = time.time()
    last_status_update = 0
    with ExitStack() as stack:
        if args.raw:
            raw_output = stack.enter_context(xopen(args.raw, 'w'))
        else:
            raw_output = None
        sequences = stack.enter_context(dnaio.open(args.fasta))
        sequences = islice(sequences, 0, args.limit)

        n = 0  # number of records processed so far
        for record in igblast(database, sequences, sequence_type=args.sequence_type,
                              species=args.species, threads=args.threads,
                              penalty=args.penalty, raw_output=raw_output,
                              use_cache=use_cache):
            n += 1
            if args.rename is not None:
                record.query_name = "{}seq{}".format(args.rename, n)
            d = record.asdict()
            if d['CDR3_aa']:
                detected_cdr3s += 1
            try:
                writer.write(d)
            except IOError as e:
                # The consumer of stdout went away; stop quietly
                if e.errno == errno.EPIPE:
                    sys.exit(1)
                raise
            if n % 1000 == 0:
                elapsed = time.time() - start_time
                # Report progress at most once per minute
                if elapsed >= last_status_update + 60:
                    logger.info(
                        'Processed {:10,d} sequences at {:.3f} ms/sequence'.format(
                            n, elapsed / n * 1E3))
                    last_status_update = elapsed

    elapsed = time.time() - start_time
    # With empty input n is 0; report zeros instead of dividing by zero
    ms_per_sequence = elapsed / n * 1E3 if n else 0.0
    cdr3_percent = detected_cdr3s / n * 100 if n else 0.0
    logger.info('Processed {:10,d} sequences at {:.1f} ms/sequence'.format(n, ms_per_sequence))
    logger.info('%d IgBLAST assignments parsed and written', n)
    logger.info('CDR3s detected in %.1f%% of all sequences', cdr3_percent)
    if args.stats:
        stats = {'total': n, 'detected_cdr3s': detected_cdr3s}
        with open(args.stats, 'w') as f:
            json.dump(stats, f)
            print(file=f)
def main(args):
    """Replace read sequences by their error-corrected cluster sequence.

    ``args.err_corr`` is parsed as whitespace-separated lines of
    ``cluster_seq num_reads raw_seq[,raw_seq...]``; lines with a different
    number of fields are skipped with a warning. Each raw sequence is mapped
    to the first cluster sequence it is listed under. Reads from
    ``args.raw_fastq`` whose sequence has such a mapping are written, with
    the corrected sequence, to ``args.corr_fasta``; other reads are dropped.
    """
    logger.info("Starting analysis")
    logger.info(f"Processing file: {args.err_corr}")

    # Warnings go through the module logger, like all other messages here
    if os.stat(args.err_corr).st_size == 0:
        logger.warning(f"File {args.err_corr} is empty.")

    err_corr = dict()
    clusters = set()
    with open(args.err_corr, "r") as file:
        for line in tqdm(file):
            try:
                cluster_seq, _num_reads, raw_seqs_list = line.split()
            except ValueError:
                logger.warning(f"Non-default starcode output line: {line}")
                continue

            clusters.add(cluster_seq)
            for raw_seq in raw_seqs_list.split(","):
                # Keep the first cluster a raw sequence was assigned to
                err_corr.setdefault(raw_seq, cluster_seq)

    logger.info(f"Clusters: {len(clusters)}")
    logger.info("Error corrected sequenced parsed.")

    logger.info("Correcting sequences and writing to output file.")

    counter = Counter()
    with dnaio.open(args.raw_fastq, mode="r", fileformat="fastq") as reader, \
            dnaio.open(args.corr_fasta, mode="w", fileformat="fasta") as openout:
        for read in tqdm(reader):
            counter['tot_reads'] += 1
            if read.sequence in err_corr:
                read.sequence = err_corr[read.sequence]
                openout.write(read)
                counter['corr_seqs'] += 1
            else:
                counter['no_err_corr_seq'] += 1

    logger.info(f"Reads total: {counter['tot_reads']:,}")
    logger.info(f"Reads corrected: {counter['corr_seqs']:,}")
    logger.info(f"Reads without corrected seq: {counter['no_err_corr_seq']:,}")
    logger.info("Finished")
def _get_sequence(read_file):
    """Yield a ``(sequence, qualities)`` tuple for every read of a FASTQ file."""
    with dnaio.open(file1=read_file, file2=None, fileformat='fastq', mode='r') as reader:
        yield from ((record.sequence, record.qualities) for record in reader)
def _open_writer(
    self,
    file: BinaryIO,
    file2: Optional[BinaryIO] = None,
    force_fasta: Optional[bool] = None,
):
    """Open a single-end writer on the already opened file object *file*.

    *file2* must be None and *file* must not be a path (str, bytes or
    Path); both conditions are asserted. ``qualities`` is taken from
    ``self.uses_qualities``. A truthy *force_fasta* selects the FASTA
    format; otherwise no format is passed.
    """
    assert file2 is None
    assert not isinstance(file, (str, bytes, Path))
    fileformat = "fasta" if force_fasta else None
    return dnaio.open(file, mode="w", qualities=self.uses_qualities, fileformat=fileformat)
def get_identifier(fastq_file):
    """Gets the identifier of the first read in a FASTQ file.

    The identifier is the read name up to the first space, cut again at the
    first ``/``.

    Args:
        fastq_file: The full path of a FASTQ file.

    Returns:
        The identifier, or None if the file contains no reads.
    """
    with dnaio.open(fastq_file) as reader:
        first_read = next(iter(reader), None)
    if first_read is None:
        return None
    before_space = first_read.name.split(" ", 1)[0]
    return before_space.split("/")[0]
def __call__(self, read, matches):
    """
    Write the read to the output file that belongs to the adapter of the
    most recent match.

    Writers are opened on first use, with ``{name}`` in ``self.template``
    replaced by the adapter name. Reads without a match go to the untrimmed
    writer, which is opened on demand if ``self.untrimmed_path`` is set;
    without it, such reads are not written. ``DISCARD`` is returned in every
    case.
    """
    if not matches:
        if self.untrimmed_writer is None and self.untrimmed_path is not None:
            self.untrimmed_writer = dnaio.open(
                self.untrimmed_path, mode='w', qualities=self.qualities)
        if self.untrimmed_writer is not None:
            self.written += 1
            self.written_bp[0] += len(read)
            self.untrimmed_writer.write(read)
        return DISCARD

    adapter_name = matches[-1].adapter.name
    if adapter_name not in self.writers:
        output_path = self.template.replace('{name}', adapter_name)
        self.writers[adapter_name] = dnaio.open(
            output_path, mode='w', qualities=self.qualities)
    self.written += 1
    self.written_bp[0] += len(read)
    self.writers[adapter_name].write(read)
    return DISCARD
def _open_writer(self, file, file2, **kwargs):
    """Open *file* and *file2* for writing through ``dnaio.open``.

    ``qualities`` is taken from ``self.uses_qualities``; any further
    keyword arguments are forwarded unchanged.
    """
    # TODO backwards-incompatible change (?) would be to use outfiles.interleaved
    # for all outputs
    writer = dnaio.open(
        file,
        file2=file2,
        mode='w',
        qualities=self.uses_qualities,
        **kwargs
    )
    return writer
def set_input(self, infiles: InputFiles):
    """Open the input described by *infiles* for reading and store the reader.

    ``file1``, ``file2`` and ``interleaved`` of *infiles* are handed to
    ``dnaio.open``; the result is kept in ``self._reader``.
    """
    reader = dnaio.open(
        infiles.file1,
        file2=infiles.file2,
        interleaved=infiles.interleaved,
        mode='r',
    )
    self._reader = reader