def _process_bam_reads(observed_reads, references, position, err_func):
    """Report reads that were observed more than once at one position.

    Records are grouped by (strand, name, sequence, qualities); every group
    holding more than one record is handed to `err_func`, with the records
    bucketed by the file they were read from.

    observed_reads -- mapping whose values are lists of (record, filename)
                      pairs; records must expose `is_reverse`, `qname`,
                      `seq` and `qual`.
    references     -- indexed with `position[1]` to obtain the value passed
                      to `err_func` as the chromosome.
    position       -- indexable pair; item 0 is forwarded as the position,
                      item 1 is the index into `references`.
    err_func       -- callable invoked as
                      err_func(chrom, pos, records, name, seq, qual), where
                      `records` maps filename -> list of records.
    """
    # .values() / .items() rather than .itervalues() / .iteritems(), so that
    # the function works on Python 3 dictionaries as well as Python 2 ones.
    for records_and_filenames in observed_reads.values():
        if len(records_and_filenames) == 1:
            # Most read-names should be observed at most once at a position
            continue

        result = collections.defaultdict(list)
        for record, filename in records_and_filenames:
            key = (record.is_reverse, record.qname, record.seq, record.qual)
            result[key].append((filename, record))

        for (is_reverse, name, seq, qual), filenames in result.items():
            if len(filenames) == 1:
                # Two reads had same name, but different characteristics
                continue

            records = collections.defaultdict(list)
            for filename, record in filenames:
                records[filename].append(record)

            if is_reverse:
                # Reverse-strand records are flipped before being reported
                seq = reverse_complement(seq)
                qual = qual[::-1]

            chrom = references[position[1]]
            pos = position[0]

            err_func(chrom, pos, records, name, seq, qual)
def _process_bam_reads(observed_reads, references, position, err_func):
    """Invoke `err_func` for every read found repeatedly at one position.

    Each value of `observed_reads` is a list of (record, filename) pairs.
    Records that agree on strand, name, sequence and qualities are treated
    as copies of the same read; whenever more than one copy exists,
    `err_func(chrom, pos, records, name, seq, qual)` is called, where
    `records` maps each filename to the records found in that file,
    `chrom` is `references[position[1]]` and `pos` is `position[0]`.
    """
    for entries in observed_reads.values():
        if len(entries) == 1:
            # The common case: the read name occurs once at this position
            continue

        # Group the entries by everything that identifies a read
        groups = {}
        for record, filename in entries:
            signature = (record.is_reverse, record.qname,
                         record.seq, record.qual)
            groups.setdefault(signature, []).append((filename, record))

        for signature, members in groups.items():
            if len(members) == 1:
                # Same name as another read, but otherwise different
                continue

            is_reverse, name, seq, qual = signature

            by_file = collections.defaultdict(list)
            for filename, record in members:
                by_file[filename].append(record)

            if is_reverse:
                # Reverse-strand records are flipped before being reported
                seq, qual = reverse_complement(seq), qual[::-1]

            err_func(references[position[1]], position[0],
                     by_file, name, seq, qual)
def write_records(self, records):
    """Write every record to `self._handle` as a four-line FASTQ entry.

    For each record the lines "@<qname>", sequence, "+" and qualities are
    written, each terminated by a newline.  Records flagged `is_reverse`
    are written with the sequence reverse complemented and the qualities
    reversed.
    """
    for record in records:
        seq, qual = record.seq, record.qual
        if record.is_reverse:
            seq, qual = reverse_complement(seq), qual[::-1]

        # Sequence and qualities must pair up one-to-one
        assert len(qual) == len(seq), record.qname

        for line in ("@%s" % (record.qname,), seq, "+", qual):
            self._handle.write("%s\n" % (line,))
def _generate_reads(cls, options, rng, sample, minimum, pcr1):
    """Collect simulated read pairs until at least `minimum` are available.

    Fragments are drawn with `sample.get_fragment()`, which must return a
    (name, sequence) pair.  For every fragment, the forward read is the
    sequence followed by `pcr1`, and the reverse read is the reverse
    complement of the sequence followed by the module-level `PCR2`.  Each
    fragment is emitted one or more times, with the copy index appended to
    its name.

    Returns a shuffled list of (name, forward, reverse) tuples.
    """
    reads = []
    while len(reads) < minimum:
        name, sequence = sample.get_fragment()
        cur_forward = sequence + pcr1
        cur_reverse = reverse_complement(sequence) + PCR2

        # Number of PCR copies -- minimum 1
        num_dupes = toint(_rexp(options.library_pcr_lambda, rng)) + 1
        # range() instead of xrange(), which does not exist on Python 3
        reads.extend(("%s_%s" % (name, dupe_id), cur_forward, cur_reverse)
                     for dupe_id in range(num_dupes))

    # NOTE(review): this shuffles with the module-level `random` rather than
    # the `rng` argument -- confirm that this is intended.
    random.shuffle(reads)
    return reads
def _get_endogenous_sequence(self):
    """Draw a random fragment from the specimen sequence.

    The fragment length comes from `self._get_frag_len()`, the start and
    strand from `self._random`.  Fragments on the "rv" strand are joined
    into a string and reverse complemented.  `self._endog_id` is
    incremented for every call and used in the generated name.

    Returns a tuple (True, name, sequence), with name formatted as
    "Seq_<id>_<real position>_<length>_<strand>".
    """
    frag_len = self._get_frag_len()
    specimen = self._specimen

    # Pick the start first and the strand second, to keep the RNG call order
    start = self._random.randint(0, len(specimen.sequence) - frag_len)
    strand = self._random.choice(("fw", "rv"))

    fragment = specimen.sequence[start:start + frag_len]
    real_pos = specimen.positions[start]
    if strand == "rv":
        fragment = reverse_complement("".join(fragment))

    self._endog_id += 1
    name = "Seq_%i_%i_%i_%s" % (self._endog_id, real_pos, frag_len, strand)
    return (True, name, fragment)
def _process_reads(cls, observed_reads, output_files):
    """Raise NodeError if the same read is found more than once.

    observed_reads -- mapping whose values are lists of (record, filename)
                      pairs; records must expose `is_reverse`, `qname`,
                      `seq` and `qual`.
    output_files   -- paths listed in the error message as `touch`
                      commands for marking the check as having succeeded.

    Reads are considered identical if strand, name, sequence and qualities
    all match.  Returns None if no such duplicates are found.
    """
    # .values() / .items() rather than .itervalues() / .iteritems(), so that
    # the function works on Python 3 dictionaries as well as Python 2 ones.
    for records_and_filenames in observed_reads.values():
        if len(records_and_filenames) == 1:
            # Most read-names should be observed at most once at a position
            continue

        result = collections.defaultdict(list)
        for record, filename in records_and_filenames:
            key = (record.is_reverse, record.qname, record.seq, record.qual)
            result[key].append(filename)

        for (is_reverse, name, seq, qual), filenames in result.items():
            if len(filenames) == 1:
                # Two reads had same name, but different characteristics
                continue

            # Number of copies of the read found in each file
            filename_counts = collections.Counter(filenames)

            if is_reverse:
                # Reverse-strand records are flipped before being reported
                seq = reverse_complement(seq)
                qual = qual[::-1]

            message = ["The same read was found multiple times!",
                       " Name: %r" % (name,),
                       " Sequence: %r" % (seq,),
                       " Qualities: %r" % (qual,),
                       ""]

            message.append("Read was found")
            for filename, count in sorted(filename_counts.items()):
                message.append(" % 2ix in %r" % (count, filename))

            message.append("")
            message.append("This indicates that the same data files have "
                           "been included multiple times in the project. "
                           "Please review the input files used in this "
                           "project, to ensure that each set of data is "
                           "included only once!\n\n"
                           "If this is not the case, then execute the "
                           "following command(s) to mark this test as "
                           "having succeeded:")

            for fpath in output_files:
                message.append("$ touch '%s'" % (fpath,))

            raise NodeError("\n".join(message))
def _collect_sequence(cls, fastafile, beds):
    """Fetch and concatenate the sequence of every BED record in `beds`.

    Each fragment is fetched with `fastafile.fetch(contig, start, end)`;
    `cls._report_failure` is called if a fragment does not have the length
    implied by the record.  If any record is on the "-" strand, all records
    are asserted to be, and the reverse complement of the concatenated
    sequence is returned.
    """
    fragments = []
    for bed in beds:
        fragment = fastafile.fetch(bed.contig, bed.start, bed.end)
        expected_length = bed.end - bed.start
        if len(fragment) != expected_length:
            cls._report_failure(bed, fragment)

        fragments.append(fragment)

    sequence = "".join(fragments)
    if not any(bed.strand == "-" for bed in beds):
        return sequence

    # Mixed strands within one set of records are not supported
    assert all(bed.strand == "-" for bed in beds)
    return sequtils.reverse_complement(sequence)
def build_genes(options, genotype, regions):
    """Yield a (gene name, sequence) pair for each named group of regions.

    `regions` is sorted in place by (contig, name, start) once iteration
    starts; consecutive records sharing a name are then built with
    `build_region` and concatenated.  If any record of a gene is on the "-"
    strand, all of its records are asserted to be, and the concatenated
    sequence is reverse complemented.
    """
    regions.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

    for gene, group in itertools.groupby(regions, lambda bed: bed.name):
        # The group is iterated more than once below, so materialize it
        beds = tuple(group)

        pieces = itertools.chain.from_iterable(
            build_region(options, genotype, bed) for bed in beds)
        sequence = "".join(pieces)

        if any(bed.strand == "-" for bed in beds):
            assert all(bed.strand == "-" for bed in beds)
            sequence = sequences.reverse_complement(sequence)

        yield (gene, sequence)
def _validate_makefile_adapters(makefile):
    """Checks for the default adapter sequences specified in the wrong
    orientation for AdapterRemoval, which is a typical mistake when using
    the --pcr2 option.

    Both the per-record "Options" and the top-level "Options" of the
    makefile are inspected.  A warning is printed using `print_warn` if a
    suspect value is found; nothing is returned.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    # Option -> value which indicates that the wrong orientation was used
    tests = {
        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
        "--pcr2": adapter_2,
        # --adapter2 (AdapterRemoval v2) expects the regular sequence
        "--adapter2": sequences.reverse_complement(adapter_2)
    }

    def check_options(options, results):
        # .items() rather than .iteritems(), for Python 3 compatibility
        for key, value in tests.items():
            if options.get(key) == value:
                results[key] = True

    results = dict.fromkeys(tests, False)
    for (_, _, _, _, record) in _iterate_over_records(makefile):
        adapterrm_opt = record.get("Options", {}).get("AdapterRemoval", {})
        check_options(adapterrm_opt, results)

    adapterrm_opt = makefile.get("Options", {}).get("AdapterRemoval", {})
    check_options(adapterrm_opt, results)

    # .values() rather than .itervalues(), for Python 3 compatibility
    if any(results.values()):
        print_warn(
            "WARNING: An adapter specified for AdapterRemoval "
            "corresponds to the default sequence, but is reverse "
            "complemented. Please make sure that this is intended! ",
            end="")

        if results["--pcr2"]:
            print_warn("For --pcr2, the sequence given should be the "
                       "reverse complement of the sequence observed in the "
                       "mate 2 FASTQ file.\n")

        if results["--adapter2"]:
            print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                       "should be exactly as observed in the FASTQ reads.\n")
def _validate_makefile_adapters(makefile):
    """Checks for the default adapter sequences specified in the wrong
    orientation for AdapterRemoval, which is a typical mistake when using
    the --pcr2 option.

    The "AdapterRemoval" options of every record yielded by
    `_iterate_over_records`, followed by those at the top level of the
    makefile, are compared against the suspect values.  Warnings are
    printed with `print_warn`; nothing is returned.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    # Option -> value which indicates that the wrong orientation was used
    tests = {
        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
        "--pcr2": adapter_2,
        # --adapter2 (AdapterRemoval v2) expects the regular sequence
        "--adapter2": sequences.reverse_complement(adapter_2)
    }

    def check_options(options, results):
        # .items() rather than .iteritems(), for Python 3 compatibility
        for key, value in tests.items():
            if options.get(key) == value:
                results[key] = True

    results = dict.fromkeys(tests, False)

    # Per-record options first, then the options shared by the whole makefile
    for (_, _, _, _, record) in _iterate_over_records(makefile):
        check_options(record.get("Options", {}).get("AdapterRemoval", {}),
                      results)

    check_options(makefile.get("Options", {}).get("AdapterRemoval", {}),
                  results)

    # .values() rather than .itervalues(), for Python 3 compatibility
    if any(results.values()):
        print_warn("WARNING: An adapter specified for AdapterRemoval "
                   "corresponds to the default sequence, but is reverse "
                   "complemented. Please make sure that this is intended! ",
                   end="")

        if results["--pcr2"]:
            print_warn("For --pcr2, the sequence given should be the "
                       "reverse complement of the sequence observed in the "
                       "mate 2 FASTQ file.\n")

        if results["--adapter2"]:
            print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                       "should be exactly as observed in the FASTQ reads.\n")
def build_regions(options, genotype, beds, reverse_compl):
    """Yield the sequence built by `build_region` for each record in `beds`.

    If `reverse_compl` is true, every sequence is reverse complemented
    before being yielded.
    """
    for bed in beds:
        built = build_region(options, genotype, bed)
        yield sequences.reverse_complement(built) if reverse_compl else built
def test_reverse_complement():
    """reverse_complement(_REF_SRC) must equal _REF_DST read backwards."""
    observed = reverse_complement(_REF_SRC)
    expected = _REF_DST[::-1]
    assert_equal(observed, expected)
def test_reverse_complement():
    """reverse_complement(_REF_SRC) must equal _REF_DST read backwards."""
    observed = reverse_complement(_REF_SRC)
    expected = _REF_DST[::-1]
    assert observed == expected