def test_write_fastq2(fastq_file, tmpdir):
    """Round-trip one FASTQ record through ``write_fastq_sequence``.

    The first record of the ``fastq_file`` fixture is loaded with
    ``num_qual=True``, written to ``test.fq`` under ``tmpdir`` and read back;
    header, sequence and qualities must be unchanged.
    """
    header, seq, qual = next(fastq.load_fastq(fastq_file, num_qual=True))

    file_name = (tmpdir / 'test.fq').strpath
    file_handle = open_file(file_name, 'w')
    try:
        fastq.write_fastq_sequence(file_handle, header, seq, qual)
    finally:
        # Close even when the write raises, so the handle is never leaked
        file_handle.close()

    headerw, seqw, qualw = next(fastq.load_fastq(file_name, num_qual=True))

    # Qualities are converted to lists before comparing
    # NOTE(review): presumably the loader returns an array-like for numeric
    # qualities, which would not compare cleanly inside a tuple - confirm
    assert (header, seq, list(qual)) == (headerw, seqw, list(qualw))
def infer_parameters(file_handle, fastq_bool, progress):
    """Scan a sequence file and summarise it.

    Arguments:
        file_handle: open file to read, its `name` attribute is logged
        fastq_bool: if true the file is read with `load_fastq` (numeric
            qualities), otherwise with `fasta.load_fasta`
        progress: if true the record iterator is wrapped in `tqdm`

    Returns:
        tuple: the longest sequence length found, the mean of the per-record
        GC content and the result of `sequence.extrapolate_model` on the
        collected qualities (`None` when `fastq_bool` is false)
    """
    LOG.info("Extrapolating model from file %s", file_handle.name)

    if fastq_bool:
        records = load_fastq(file_handle, num_qual=True)
    else:
        records = fasta.load_fasta(file_handle)

    if progress:
        records = tqdm(records)

    longest = 0
    gc_values = []
    # Only filled (and only used) when reading FASTQ
    qualities = []

    for record in records:
        read = record[1]
        read_length = len(read)
        if read_length > longest:
            longest = read_length
        gc_values.append(sequence.sequence_gc_content(read))
        if fastq_bool:
            qualities.append(record[2])

    model = sequence.extrapolate_model(qualities) if fastq_bool else None

    return longest, numpy.mean(gc_values), model
def convert_command(verbose, fastq_file, fasta_file):
    """Write every record of a FASTQ file in FASTA format.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        fastq_file: input passed to `load_fastq`
        fasta_file: output passed to `fasta.write_fasta_sequence`; its
            `name` attribute (or its `repr` if it has none) is logged
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    output_name = getattr(fasta_file, 'name', repr(fasta_file))
    LOG.info("Writing FASTA file (%s)", output_name)

    # The quality string is read but not needed for FASTA output
    for name, bases, _quality in load_fastq(fastq_file):
        fasta.write_fasta_sequence(fasta_file, name, bases)
def test_load_fastq4(fastq_file):
    """Check the numeric qualities of the first record of the fixture."""
    # NOTE(review): another `test_load_fastq4` appears in this source; if both
    # live in the same module the later definition shadows the earlier one
    # and only one of them is collected - confirm
    first_record = next(fastq.load_fastq(fastq_file, num_qual=True))
    _header, _seq, qualities = first_record

    assert list(qualities) == [
        24, 34, 26, 28, 26, 28, 27, 24, 31, 19, 23, 21, 23, 29, 24, 25,
        21, 22, 32, 32, 27, 24, 29, 21, 20, 27, 28, 29, 20, 24, 16,
    ]
def fq_sync_command(verbose, master_file, input_file, output_file):
    """Write the records of `input_file` that follow the headers of `master_file`.

    The current header of the master file is compared (via `compare_header`)
    with each record of the input file; on a match the input record is
    written to `output_file` and the next master header is taken. Reading
    stops when the master file is exhausted.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        master_file: FASTQ input whose headers drive the selection
        input_file: FASTQ input to filter
        output_file: output passed to `write_fastq_sequence`
    """
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    master_records = load_fastq(master_file, num_qual=False)
    written_count = 0

    first_master = next(master_records, None)
    if first_master is None:
        # An empty master file gives nothing to synchronise against; report
        # zero written sequences instead of failing with StopIteration
        LOG.info("Wrote %d FASTQ sequences", written_count)
        return

    master_header = first_master[0]
    # The header layout is chosen once, from the first master header
    header_type = choose_header_type(master_header)

    for header, seq, qual in load_fastq(input_file, num_qual=False):
        if compare_header(master_header, header, header_type):
            write_fastq_sequence(output_file, header, seq, qual)
            written_count += 1

            # Advance the master only after a match
            try:
                master_header = next(master_records)[0]
            except StopIteration:
                # No more master headers: nothing else can match
                break

    LOG.info("Wrote %d FASTQ sequences", written_count)
def test_load_fastq4(fastq_file):
    """Check the quality string of the first record of the fixture."""
    # NOTE(review): another `test_load_fastq4` appears in this source; if both
    # live in the same module the later definition shadows the earlier one
    # and only one of them is collected - confirm
    first_record = next(fastq.load_fastq(fastq_file))
    _header, _seq, quality = first_record

    assert quality == '9C;=;=<9@4868>9:67AA<9>65<=>591'
def test_load_fastq3(fastq_file):
    """Check the header of the first record of the fixture."""
    first_record = next(fastq.load_fastq(fastq_file))
    name, _seq, _quality = first_record

    assert name == 'cluster_2:UMI_ATTCCG'
def test_load_fastq2(fastq_file):
    """Check the sequence of the first record of the fixture."""
    first_record = next(fastq.load_fastq(fastq_file))
    _header, bases, _quality = first_record

    assert bases == 'TTTCCGGGGCACATAATCTTCAGCCGGGCGC'
def test_load_fastq1(fastq_file):
    """Check that the fixture yields 250 records."""
    record_count = 0
    for _record in fastq.load_fastq(fastq_file):
        record_count += 1

    assert record_count == 250
def sort(verbose, mate1_input, mate2_input, mate1_output, mate2_output):
    """Sort two fastq files"""
    # NOTE(review): the docstring text is left untouched in case it is
    # surfaced as command help - confirm against the decorators
    #
    # Records are read in lockstep from the two inputs (`zip` stops at the
    # shorter one). A pair sharing the same key is written immediately;
    # otherwise both records are kept in memory until the record with the
    # matching key shows up in the other file.
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    mate1_name = getattr(mate1_output, 'name', repr(mate1_output))
    LOG.info('Writing [mate1-output] to file (%s)', mate1_name)
    mate2_name = getattr(mate2_output, 'name', repr(mate2_output))
    LOG.info('Writing [mate2-output] to file (%s)', mate2_name)

    header_regex = None
    use_simple_header = False
    # Records still waiting for their mate, indexed by key
    pending1 = {}
    pending2 = {}
    read_count = 0
    written_count = 0

    paired_records = zip(load_fastq(mate1_input), load_fastq(mate2_input))

    for (seq_id1, bases1, qual1), (seq_id2, bases2, qual2) in paired_records:
        read_count += 1

        # The header layout is detected until either a regex is found or the
        # simple structure is selected
        if header_regex is None and not use_simple_header:
            header_regex = choose_header_type(seq_id1)
            if header_regex is None:
                use_simple_header = True
                LOG.info("Using a simple header structure")

        if use_simple_header:
            # Everything but the last character of the header is the key
            key1 = seq_id1[:-1]
            key2 = seq_id2[:-1]
        else:
            match1 = header_regex.search(seq_id1)
            match2 = header_regex.search(seq_id2)
            # group() with several names returns them as a tuple
            key1 = match1.group('lane', 'tile', 'xcoord', 'ycoord')
            key2 = match2.group('lane', 'tile', 'xcoord', 'ycoord')

        record1 = (seq_id1, bases1, qual1)
        record2 = (seq_id2, bases2, qual2)

        if key1 == key2:
            # The two records are already mates: write them straight away
            write_fastq_sequence(mate1_output, *record1)
            write_fastq_sequence(mate2_output, *record2)
            written_count += 1
        else:
            pending1[key1] = record1
            pending2[key2] = record2

            # First check if the mate of the record from file 1 was already
            # seen in file 2, then the other way around
            for key, other_pending in ((key1, pending2), (key2, pending1)):
                if key in other_pending:
                    write_fastq_sequence(mate1_output, *pending1[key])
                    write_fastq_sequence(mate2_output, *pending2[key])
                    del pending1[key]
                    del pending2[key]
                    written_count += 1

        report_counts(read_count, written_count, read_count)

    report_counts(read_count, written_count, None)
def deinterleave(verbose, strip, fastq_file, mate1_file, mate2_file):
    """Deinterleave a fastq file"""
    # NOTE(review): the docstring text is left untouched in case it is
    # surfaced as command help - confirm against the decorators
    #
    # Each record is stored by key under its mate number (1, anything else is
    # treated as mate 2); as soon as both mates of a key are available they
    # are written to `mate1_file` and `mate2_file` and dropped from memory.
    # If `strip` is true, only the part of the header before the first tab
    # is written.
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    LOG.info('Writing [mate1-file] to file (%s)',
             getattr(mate1_file, 'name', repr(mate1_file)))
    LOG.info('Writing [mate2-file] to file (%s)',
             getattr(mate2_file, 'name', repr(mate2_file)))

    regex = None
    simple_header = False

    # Records still waiting for their mate, indexed by key
    mate1 = {}
    mate2 = {}

    count = 0
    wcount = 0

    for seq_id, seq, qual in load_fastq(fastq_file):
        count += 1

        # The header layout is detected until either a regex is found or the
        # simple structure is selected
        if (regex is None) and (not simple_header):
            regex = choose_header_type(seq_id)
            if regex is None:
                LOG.info("Using a simple header structure")
                simple_header = True

        if simple_header:
            # The last character of the header is the mate number, the rest
            # is the key
            key = seq_id[:-1]
            mate = int(seq_id[-1])
        else:
            match = regex.search(seq_id)
            key = (
                match.group('lane'),
                match.group('tile'),
                match.group('xcoord'),
                match.group('ycoord')
            )
            mate = int(match.group('mate'))

        if strip:
            sequence_name = seq_id.split('\t')[0]
        else:
            sequence_name = seq_id

        if mate == 1:
            mate1[key] = (sequence_name, seq, qual)
        else:
            mate2[key] = (sequence_name, seq, qual)

        # Explicit membership test instead of a try/except KeyError around
        # the whole block: a KeyError raised while writing must not be
        # silently swallowed, as that would leave the two outputs out of sync
        if key in mate1 and key in mate2:
            seq1 = mate1[key]
            seq2 = mate2[key]
            write_fastq_sequence(mate1_file, *seq1)
            write_fastq_sequence(mate2_file, *seq2)
            wcount += 2
            del mate1[key]
            del mate2[key]

        report_counts(count, wcount, count)

    report_counts(count, wcount, None)