def _offline_counter(args):
    """ Offline counting from SAM/BAM file. """
    counts = read_counter.count_reads(
        args.bam.name, in_format=args.f, min_aln_qual=args.a, verbose=not args.Q)
    counts = OrderedDict(six.iteritems(counts))

    # Parse requested word sizes (guard against -k not being given):
    calc_words = []
    if args.k is not None:
        calc_words = [int(k) for k in args.k.split(",")]

    data = OrderedDict()

    # Calculate sequence properties:
    if args.z is not None:
        lengths, gc_contents, word_freqs = {}, {}, defaultdict(
            lambda: defaultdict(dict))
        ref_iter = seq_util.read_seq_records(args.z)
        if not args.Q:
            sys.stderr.write("Calculating sequence features:\n")
            ref_iter = tqdm.tqdm(ref_iter)

        for ref in ref_iter:
            # Augment counts dictionary with missing reference entries:
            if ref.id not in counts:
                counts[ref.id] = 0
            lengths[ref.id] = len(ref)
            gc_contents[ref.id] = seq_util.gc_content(str(ref.seq))
            if args.k is not None:
                for word_size in calc_words:
                    bf = seq_util.word_composition(ref.seq, word_size)
                    for word, count in six.iteritems(bf):
                        word_freqs[word_size][ref.id][word] = float(count) / len(ref)

        data['Length'] = [lengths[tr] for tr in six.iterkeys(counts)]
        data['GC_content'] = [gc_contents[tr] for tr in six.iterkeys(counts)]

    data['Reference'] = list(counts.keys())
    data['Count'] = list(counts.values())

    # Calculate word frequencies:
    if args.k is not None and args.z:
        for ks in calc_words:
            for word in next(iter(word_freqs[ks].values())).keys():
                tmp = []
                for ref in counts.keys():
                    tmp.append(word_freqs[ks][ref][word])
                data[word] = tmp

    data_frame = pd.DataFrame(data)
    data_frame = data_frame.sort_values(['Count', 'Reference'], ascending=False)

    if args.t is not None:
        data_frame.to_csv(args.t, sep='\t', index=False)

    if args.p is not None:
        misc.pickle_dump(data, args.p)
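# Illustration only (not part of the script above): the per-reference word
# frequencies stored in word_freqs are k-mer counts divided by the reference
# length. A minimal standalone sketch of that normalisation, using only the
# standard library (the real script delegates to seq_util.word_composition);
# the helper name below is made up:
from collections import Counter


def _word_frequencies_sketch(seq, word_size):
    """Return k-mer frequencies of seq, normalised by sequence length."""
    counts = Counter(seq[i:i + word_size]
                     for i in range(len(seq) - word_size + 1))
    return dict((word, count / float(len(seq)))
                for word, count in counts.items())


# Example: _word_frequencies_sketch("ACGTACGT", 2)
#   -> {'AC': 0.25, 'CG': 0.25, 'GT': 0.25, 'TA': 0.125}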
def _get_lengths(in_file, in_format, min_length, max_length, do_log):
    """ Iterate over input and accumulate sequence lengths. """
    input_iterator = seq_util.read_seq_records(in_file, format=in_format)

    lengths = []
    for record in input_iterator:
        length = len(record)
        # Filter for minimum read length:
        if (min_length is not None) and (length < min_length):
            continue
        # Filter for maximum read length:
        if (max_length is not None) and (length > max_length):
            continue
        if do_log:
            length = np.log(length)
        lengths.append(length)
    input_iterator.close()

    return lengths
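# Illustration only: the same filter-and-optionally-log-transform pattern,
# written directly against Biopython's SeqIO (the script above goes through
# seq_util.read_seq_records instead). The file name and helper name below
# are made up for the example:
import numpy as np
from Bio import SeqIO


def _lengths_sketch(fastq_path, min_length=None, max_length=None, do_log=False):
    """Collect (optionally log-transformed) read lengths from a fastq file."""
    lengths = []
    for record in SeqIO.parse(fastq_path, "fastq"):
        length = len(record)
        if min_length is not None and length < min_length:
            continue
        if max_length is not None and length > max_length:
            continue
        lengths.append(np.log(length) if do_log else length)
    return lengths


# Example call (hypothetical file): _lengths_sketch("reads.fastq", min_length=100)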
parser.add_argument(
    '-p', metavar='results_pickle', type=str,
    help="Save pickled results in this file.", default=None)
parser.add_argument(
    'input_fastx', nargs='?', help='Input (default: stdin).',
    type=argparse.FileType('r'), default=sys.stdin)


if __name__ == '__main__':
    args = parser.parse_args()

    in_format = args.f
    input_iterator = seq_util.read_seq_records(args.input_fastx, format=in_format)

    total_bases = 0
    for record in input_iterator:
        total_bases += len(record)

    results = {'total_bases': total_bases}
    print("Total bases\t{}".format(total_bases))

    if args.s is not None:
        results['genome_size'] = args.s
        results['coverage'] = float(total_bases) / args.s
        print("Genome size\t{}".format(results['genome_size']))
        print("Coverage\t{}".format(results['coverage']))

    if args.p is not None:
        misc.pickle_dump(results, args.p)
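# Illustration only: coverage as computed above is simply total sequenced
# bases divided by genome size. For example, 50000000 bases over a 5000000
# base genome gives 10x coverage (example numbers are made up):
_total_bases_example = 50000000
_genome_size_example = 5000000
_coverage_example = float(_total_bases_example) / _genome_size_example  # 10.0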
from wub.util import seq as seq_util

# Parse command line arguments:
parser = argparse.ArgumentParser(
    description='Reverse (but not complement!) sequences and qualities in fastq file.')
parser.add_argument(
    'input_fastq', nargs='?', help='Input fastq (default: stdin).',
    type=argparse.FileType('r'), default=sys.stdin)
parser.add_argument(
    'output_fastq', nargs='?', help='Output fastq (default: stdout).',
    type=argparse.FileType('w'), default=sys.stdout)


def reverse_seq_records(input_iterator):
    """Reverse SeqRecord objects.

    :param input_iterator: Iterator of SeqRecord objects.
    :returns: Generator of reversed SeqRecord objects.
    :rtype: generator
    """
    for record in input_iterator:
        yield record[::-1]


if __name__ == '__main__':
    args = parser.parse_args()

    input_iterator = seq_util.read_seq_records(
        args.input_fastq, format='fastq')

    output_iterator = reverse_seq_records(input_iterator)

    seq_util.write_seq_records(
        output_iterator, args.output_fastq, format='fastq')
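# Illustration only: record[::-1] relies on Biopython's SeqRecord slicing,
# which reverses the sequence together with its per-letter annotations
# (including phred qualities). A small self-contained example; the record
# contents are made up:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

_example = SeqRecord(Seq("ACGT"), id="read1", description="")
_example.letter_annotations["phred_quality"] = [10, 20, 30, 40]
_reversed = _example[::-1]
# str(_reversed.seq) == "TGCA"
# _reversed.letter_annotations["phred_quality"] == [40, 30, 20, 10]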
        read_seq = mutated_record.seq
        if direction == '-':
            read_seq = seq_util.reverse_complement(mutated_record.seq)

        yield seq_util.new_dna_record(read_seq, read_name, mock_qualities), sam


if __name__ == '__main__':
    args = parser.parse_args()

    # Set random seed:
    if args.z is not None:
        np.random.seed(args.z)

    # Read in chromosomes of the input genome:
    chromosomes = list(seq_util.read_seq_records(args.input_fasta))

    # Process error weights:
    error_weights = np.array(parse_util.separated_list_to_floats(args.w))
    # Normalise error weights to probabilities:
    error_weights = parse_util.normalise_array(error_weights)
    error_weights = dict(
        zip(['substitution', 'insertion', 'deletion'], error_weights))

    sw = None
    if args.s is not None:
        sw = sam_writer.SamWriter(args.s, build_sam_header(chromosomes))

    simulation_iterator = simulate_sequencing(chromosomes, args.m, args.a, args.l,
                                              args.u, args.e, error_weights,
                                              args.b, args.q,
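# Illustration only: "Normalise error weights to probabilities" above means
# rescaling the weights so they sum to one; the same effect can be had with
# plain numpy. Example weights (substitution, insertion, deletion) of 1:1:2
# become 0.25, 0.25, 0.5 (example numbers are made up):
import numpy as np

_weights = np.array([1.0, 1.0, 2.0])
_probs = _weights / _weights.sum()  # array([0.25, 0.25, 0.5])
_error_probs = dict(zip(['substitution', 'insertion', 'deletion'], _probs))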
parser.add_argument(
    'outfile', metavar='output_file', type=str, help="Output SAM file.")


if __name__ == '__main__':
    args = parser.parse_args()

    input_iter = bam_common.pysam_open(args.infile, args.f).fetch(until_eof=True)

    # Get SAM record names:
    sam_names = [record.query_name for record in input_iter]

    writer = sam_writer.SamWriter(args.outfile)

    for read in seq_util.read_seq_records(args.q, 'fastq'):
        if read.id not in sam_names:
            qual = seq_util.quality_array_to_string(
                read.letter_annotations["phred_quality"])
            sam_record = writer.new_sam_record(
                qname=read.id, flag=4, rname="*", pos=0, mapq=0, cigar="*",
                rnext="*", pnext=0, tlen=0, seq=str(read.seq), qual=qual,
                tags="AS:i:0")
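# Illustration only: the record created above follows the SAM convention for
# unmapped reads (flag 4, '*' reference name and CIGAR, zero position and
# mapping quality). The equivalent plain-text SAM line could be assembled
# like this (read name, sequence and qualities are made up):
_fields = ["read_1", "4", "*", "0", "0", "*", "*", "0", "0",
           "ACGT", "IIII", "AS:i:0"]
_sam_line = "\t".join(_fields)
# read_1  4  *  0  0  *  *  0  0  ACGT  IIII  AS:i:0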
parser.add_argument(
    'output_fastx', nargs='?', help='Output file (default: stdout).',
    type=argparse.FileType('w'), default=sys.stdout)


def _record_filter(input_iter_bait, input_iter_target):
    """ Filter out SeqRecord objects present in the first iterator. """
    bait_ids = [read.id for read in input_iter_bait]
    for record in input_iter_target:
        if record.id not in bait_ids:
            yield record


if __name__ == '__main__':
    args = parser.parse_args()

    input_iterator_bait = seq_util.read_seq_records(
        args.input_fastx_bait, format=args.i)
    input_iterator_target = seq_util.read_seq_records(
        args.input_fastx_target, format=args.i)

    output_iterator = _record_filter(input_iterator_bait, input_iterator_target)

    seq_util.write_seq_records(
        output_iterator, args.output_fastx, format=args.o)
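# Illustration only: bait_ids above is a list, so each membership test is
# linear in the number of bait reads. For large bait files the same filter
# can hold the ids in a set for constant-time lookups without changing
# behaviour (the variant name below is made up):
def _record_filter_set(input_iter_bait, input_iter_target):
    """Variant of _record_filter using a set of bait read ids."""
    bait_ids = set(read.id for read in input_iter_bait)
    for record in input_iter_target:
        if record.id not in bait_ids:
            yield record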