def _run(self, _config, temp):
    """Write one FASTA record per named group of BED intervals.

    The intervals listed in `self._intervals` are processed contig by
    contig; within a contig, intervals sharing a BED name are ordered by
    start position and the matching stretches of `self._reference` are
    concatenated. A group located on the minus strand is reverse
    complemented as a whole. The records are written to "sequences.fasta"
    inside `temp`, and that file is then moved to `self._outfile`.

    Parameters:
        _config: Not used by this method.
        temp: Directory in which the intermediate FASTA file is created.

    Raises:
        AssertionError: If a named group mixes "-" with other strands.
    """
    def keyfunc(bed):
        return (bed.contig, bed.name, bed.start)

    seqs = {}
    fastafile = pysam.Fastafile(self._reference)
    try:
        with open(self._intervals) as bedfile:
            intervals = text.parse_lines_by_contig(bedfile, pysam.asBed())

            for (contig, beds) in sorted(intervals.items()):
                # groupby only merges adjacent records, hence the sort
                beds.sort(key=keyfunc)

                for (gene, gene_beds) in itertools.groupby(beds,
                                                           lambda x: x.name):
                    gene_beds = tuple(gene_beds)
                    seq = "".join(fastafile.fetch(contig, bed.start, bed.end)
                                  for bed in gene_beds)

                    if any((bed.strand == "-") for bed in gene_beds):
                        assert all((bed.strand == "-") for bed in gene_beds)
                        seq = sequences.reverse_complement(seq)

                    seqs[(contig, gene)] = seq
    finally:
        # Release the reference handle even if the extraction fails
        fastafile.close()

    temp_file = os.path.join(temp, "sequences.fasta")
    with open(temp_file, "w") as out_file:
        # Records are emitted ordered by (contig, name)
        for ((_, gene), sequence) in sorted(seqs.items()):
            fasta.print_fasta(gene, sequence, out_file)

    move_file(temp_file, self._outfile)
def _run(self, _config, temp):
    """Write one FASTA record per named group of BED intervals.

    The intervals listed in `self._intervals` are processed contig by
    contig; within a contig, intervals sharing a BED name are ordered by
    start position and the matching stretches of `self._reference` are
    concatenated. A group located on the minus strand is reverse
    complemented as a whole. The records are written to "sequences.fasta"
    inside `temp`, and that file is then moved to `self._outfile`.

    Parameters:
        _config: Not used by this method.
        temp: Directory in which the intermediate FASTA file is created.

    Raises:
        AssertionError: If a named group mixes "-" with other strands.
    """
    def keyfunc(bed):
        return (bed.contig, bed.name, bed.start)

    seqs = {}
    fastafile = pysam.Fastafile(self._reference)
    try:
        with open(self._intervals) as bedfile:
            intervals = text.parse_lines_by_contig(bedfile, pysam.asBed())

            for (contig, beds) in sorted(intervals.items()):
                # groupby only merges adjacent records, hence the sort
                beds.sort(key=keyfunc)

                for (gene, gene_beds) in itertools.groupby(beds,
                                                           lambda x: x.name):
                    gene_beds = tuple(gene_beds)
                    seq = "".join(fastafile.fetch(contig, bed.start, bed.end)
                                  for bed in gene_beds)

                    if any((bed.strand == "-") for bed in gene_beds):
                        assert all((bed.strand == "-") for bed in gene_beds)
                        seq = sequences.reverse_complement(seq)

                    seqs[(contig, gene)] = seq
    finally:
        # Release the reference handle even if the extraction fails
        fastafile.close()

    temp_file = os.path.join(temp, "sequences.fasta")
    with open(temp_file, "w") as out_file:
        # Records are emitted ordered by (contig, name)
        for ((_, gene), sequence) in sorted(seqs.items()):
            fasta.print_fasta(gene, sequence, out_file)

    move_file(temp_file, self._outfile)
def _generate_reads(cls, options, rng, sample, minimum, pcr1): reads = [] while len(reads) < minimum: name, sequence = sample.get_fragment() cur_forward = sequence + pcr1 cur_reverse = reverse_complement(PCR2 + sequence) # Number of PCR copies -- minimum 1 num_dupes = toint(_rexp(options.library_pcr_lambda, rng)) + 1 for dupe_id in xrange(num_dupes): cur_name = "%s_%s" % (name, dupe_id) reads.append((cur_name, cur_forward, cur_reverse)) random.shuffle(reads) return reads
def _generate_reads(cls, options, rng, sample, minimum, pcr1): reads = [] while len(reads) < minimum: name, sequence = sample.get_fragment() cur_forward = sequence + pcr1 cur_reverse = reverse_complement(sequence) + PCR2 # Number of PCR copies -- minimum 1 num_dupes = toint(_rexp(options.library_pcr_lambda, rng)) + 1 for dupe_id in xrange(num_dupes): cur_name = "%s_%s" % (name, dupe_id) reads.append((cur_name, cur_forward, cur_reverse)) random.shuffle(reads) return reads
def write_records(self, records):
    """Write each record to `self._handle` as four FASTQ-style lines.

    The lines are "@<qname>", the sequence, "+", and the qualities. For
    records flagged `is_reverse`, the sequence is reverse complemented
    and the quality string is reversed before writing.

    Raises:
        AssertionError: If sequence and qualities differ in length; the
            message is the record's qname.
    """
    for record in records:
        bases, quals = record.seq, record.qual
        if record.is_reverse:
            bases, quals = reverse_complement(bases), quals[::-1]

        assert len(quals) == len(bases), record.qname

        handle = self._handle
        for line in ("@%s\n" % (record.qname,),
                     "%s\n" % (bases,),
                     "+\n",
                     "%s\n" % (quals,)):
            handle.write(line)
def write_records(self, records):
    """Write each record to `self._handle` as four FASTQ-style lines.

    The lines are "@<qname>", the sequence, "+", and the qualities. For
    records flagged `is_reverse`, the sequence is reverse complemented
    and the quality string is reversed before writing.

    Raises:
        AssertionError: If sequence and qualities differ in length; the
            message is the record's qname.
    """
    for record in records:
        bases, quals = record.seq, record.qual
        if record.is_reverse:
            bases, quals = reverse_complement(bases), quals[::-1]

        assert len(quals) == len(bases), record.qname

        handle = self._handle
        for line in ("@%s\n" % (record.qname, ),
                     "%s\n" % (bases, ),
                     "+\n",
                     "%s\n" % (quals, )):
            handle.write(line)
def _get_endogenous_sequence(self): length = self._get_frag_len() max_position = len(self._specimen.sequence) - length position = self._random.randint(0, max_position) strand = self._random.choice(("fw", "rv")) sequence = self._specimen.sequence[position:position + length] real_pos = self._specimen.positions[position] if strand == "rv": sequence = reverse_complement("".join(sequence)) self._endog_id += 1 name = "Seq_%i_%i_%i_%s" % (self._endog_id, real_pos, length, strand) return (True, name, sequence)
def _collect_sequence(cls, fastafile, beds): sequence = [] for bed in beds: fragment = fastafile.fetch(bed.contig, bed.start, bed.end) if len(fragment) != (bed.end - bed.start): cls._report_failure(bed, fragment) sequence.append(fragment) sequence = "".join(sequence) if any((bed.strand == "-") for bed in beds): assert all((bed.strand == "-") for bed in beds) sequence = sequtils.reverse_complement(sequence) return sequence
def build_genes(options, genotype, regions):
    """Lazily yield a (name, sequence) pair for each named BED group.

    `regions` is sorted IN PLACE by (contig, name, start) once iteration
    begins. Consecutive records sharing a name are then combined: the
    output of `build_region` for each record is concatenated, and the
    result is reverse complemented if the group is on the "-" strand.

    Raises:
        AssertionError: If a group mixes "-" strand records with others.
    """
    regions.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

    for (gene, group) in itertools.groupby(regions, lambda bed: bed.name):
        gene_beds = tuple(group)
        pieces = itertools.chain.from_iterable(
            build_region(options, genotype, bed) for bed in gene_beds)
        sequence = "".join(pieces)

        if any((bed.strand == "-") for bed in gene_beds):
            assert all((bed.strand == "-") for bed in gene_beds)
            sequence = sequences.reverse_complement(sequence)

        yield (gene, sequence)
def _validate_makefile_adapters(makefile):
    """Warn about default adapters given in the wrong orientation.

    Inspects the AdapterRemoval options of every record yielded by
    `_iterate_over_records` as well as the makefile-wide options, and
    prints a warning if "--pcr2" or "--adapter2" is set to the default
    mate 2 adapter in the orientation that option does not expect; this
    is a typical mistake when using the --pcr2 option.
    """
    # Mate 2 adapter exactly as seen in raw FASTQ reads, i.e. NOT reverse
    # complemented
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    # Maps each option to the value that signals a probable mix-up
    tests = {}
    # --pcr2 expects the reverse complement, so the raw sequence is suspect
    tests["--pcr2"] = adapter_2
    # --adapter2 (AdapterRemoval v2) expects the raw sequence, so the
    # reverse complement is suspect
    tests["--adapter2"] = sequences.reverse_complement(adapter_2)

    # Per-record options first, followed by the makefile-wide options
    option_sets = [record.get("Options", {}).get("AdapterRemoval", {})
                   for (_, _, _, _, record) in _iterate_over_records(makefile)]
    option_sets.append(makefile.get("Options", {}).get("AdapterRemoval", {}))

    results = dict.fromkeys(tests, False)
    for options in option_sets:
        for key, value in tests.iteritems():
            if options.get(key) == value:
                results[key] = True

    if not any(results.itervalues()):
        return

    print_warn(
        "WARNING: An adapter specified for AdapterRemoval "
        "corresponds to the default sequence, but is reverse "
        "complemented. Please make sure that this is intended! ",
        end="")

    if results["--pcr2"]:
        print_warn("For --pcr2, the sequence given should be the "
                   "reverse complement of the sequence observed in the "
                   "mate 2 FASTQ file.\n")

    if results["--adapter2"]:
        print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                   "should be exactly as observed in the FASTQ reads.\n")
def _validate_makefile_adapters(makefile):
    """Warn about default adapters given in the wrong orientation.

    Inspects the AdapterRemoval options of every record yielded by
    `_iterate_over_records` as well as the makefile-wide options, and
    prints a warning if "--pcr2" or "--adapter2" is set to the default
    mate 2 adapter in the orientation that option does not expect; this
    is a typical mistake when using the --pcr2 option.
    """
    # Mate 2 adapter exactly as seen in raw FASTQ reads, i.e. NOT reverse
    # complemented
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    # Maps each option to the value that signals a probable mix-up
    tests = {}
    # --pcr2 expects the reverse complement, so the raw sequence is suspect
    tests["--pcr2"] = adapter_2
    # --adapter2 (AdapterRemoval v2) expects the raw sequence, so the
    # reverse complement is suspect
    tests["--adapter2"] = sequences.reverse_complement(adapter_2)

    # Per-record options first, followed by the makefile-wide options
    option_sets = [record.get("Options", {}).get("AdapterRemoval", {})
                   for (_, _, _, _, record) in _iterate_over_records(makefile)]
    option_sets.append(makefile.get("Options", {}).get("AdapterRemoval", {}))

    results = dict.fromkeys(tests, False)
    for options in option_sets:
        for key, value in tests.iteritems():
            if options.get(key) == value:
                results[key] = True

    if not any(results.itervalues()):
        return

    print_warn("WARNING: An adapter specified for AdapterRemoval "
               "corresponds to the default sequence, but is reverse "
               "complemented. Please make sure that this is intended! ",
               end="")

    if results["--pcr2"]:
        print_warn("For --pcr2, the sequence given should be the "
                   "reverse complement of the sequence observed in the "
                   "mate 2 FASTQ file.\n")

    if results["--adapter2"]:
        print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                   "should be exactly as observed in the FASTQ reads.\n")
def build_regions(options, genotype, beds, reverse_compl):
    """Lazily yield the sequence built for each record in `beds`.

    Every record is passed to `build_region` together with `options` and
    `genotype`; when `reverse_compl` is truthy, the result is reverse
    complemented before being yielded.
    """
    for record in beds:
        built = build_region(options, genotype, record)
        yield sequences.reverse_complement(built) if reverse_compl else built
def test_reverse_complement():
    """Check that reverse_complement(_REF_SRC) equals _REF_DST reversed."""
    expected = _REF_DST[::-1]
    observed = reverse_complement(_REF_SRC)
    assert_equal(observed, expected)