Ejemplo n.º 1
0
    def _run(self, _config, temp):
        """Write the sequence of every named BED feature to a FASTA file.

        Intervals are read from `self._intervals` and the matching bases are
        fetched from the FASTA file `self._reference`. Within each contig,
        intervals sharing a name are concatenated in order of their start
        position; if they are on the "-" strand, the concatenated sequence
        is reverse complemented. Results are written to "sequences.fasta"
        in `temp`, which is then moved to `self._outfile`.

        `_config` is not used by this method.
        """
        def keyfunc(bed):
            return (bed.contig, bed.name, bed.start)

        seqs = {}
        fastafile = pysam.Fastafile(self._reference)
        try:
            with open(self._intervals) as bedfile:
                intervals = text.parse_lines_by_contig(bedfile, pysam.asBed())
                for (contig, beds) in sorted(intervals.items()):
                    # groupby only merges adjacent items, so sort first
                    beds.sort(key=keyfunc)

                    by_name = itertools.groupby(beds, lambda x: x.name)
                    for (gene, gene_beds) in by_name:
                        # Materialized, since the group is walked repeatedly
                        gene_beds = tuple(gene_beds)
                        seq = "".join(
                            fastafile.fetch(contig, bed.start, bed.end)
                            for bed in gene_beds)

                        # Features must not mix strands
                        if any((bed.strand == "-") for bed in gene_beds):
                            assert all((bed.strand == "-")
                                       for bed in gene_beds)
                            seq = sequences.reverse_complement(seq)
                        seqs[(contig, gene)] = seq
        finally:
            # Release the reference handle even if parsing/fetching fails
            fastafile.close()

        temp_file = os.path.join(temp, "sequences.fasta")
        with open(temp_file, "w") as out_file:
            for ((_, gene), sequence) in sorted(seqs.items()):
                fasta.print_fasta(gene, sequence, out_file)

        move_file(temp_file, self._outfile)
Ejemplo n.º 2
0
    def _run(self, _config, temp):
        """Write the sequence of every named BED feature to a FASTA file.

        Intervals are read from `self._intervals` and the matching bases are
        fetched from the FASTA file `self._reference`. Within each contig,
        intervals sharing a name are concatenated in order of their start
        position; if they are on the "-" strand, the concatenated sequence
        is reverse complemented. Results are written to "sequences.fasta"
        in `temp`, which is then moved to `self._outfile`.

        `_config` is not used by this method.
        """
        def keyfunc(bed):
            return (bed.contig, bed.name, bed.start)

        seqs = {}
        fastafile = pysam.Fastafile(self._reference)
        try:
            with open(self._intervals) as bedfile:
                intervals = text.parse_lines_by_contig(bedfile, pysam.asBed())
                for (contig, beds) in sorted(intervals.items()):
                    # groupby only merges adjacent items, so sort first
                    beds.sort(key=keyfunc)

                    by_name = itertools.groupby(beds, lambda x: x.name)
                    for (gene, gene_beds) in by_name:
                        # Materialized, since the group is walked repeatedly
                        gene_beds = tuple(gene_beds)
                        seq = "".join(
                            fastafile.fetch(contig, bed.start, bed.end)
                            for bed in gene_beds)

                        # Features must not mix strands
                        if any((bed.strand == "-") for bed in gene_beds):
                            assert all((bed.strand == "-")
                                       for bed in gene_beds)
                            seq = sequences.reverse_complement(seq)
                        seqs[(contig, gene)] = seq
        finally:
            # Release the reference handle even if parsing/fetching fails
            fastafile.close()

        temp_file = os.path.join(temp, "sequences.fasta")
        with open(temp_file, "w") as out_file:
            for ((_, gene), sequence) in sorted(seqs.items()):
                fasta.print_fasta(gene, sequence, out_file)

        move_file(temp_file, self._outfile)
Ejemplo n.º 3
0
 def _generate_reads(cls, options, rng, sample, minimum, pcr1):
     reads = []
     while len(reads) < minimum:
         name, sequence = sample.get_fragment()
         cur_forward = sequence + pcr1
         cur_reverse = reverse_complement(PCR2 + sequence)
         # Number of PCR copies -- minimum 1
         num_dupes = toint(_rexp(options.library_pcr_lambda, rng)) + 1
         for dupe_id in xrange(num_dupes):
             cur_name = "%s_%s" % (name, dupe_id)
             reads.append((cur_name, cur_forward, cur_reverse))
     random.shuffle(reads)
     return reads
Ejemplo n.º 4
0
 def _generate_reads(cls, options, rng, sample, minimum, pcr1):
     reads = []
     while len(reads) < minimum:
         name, sequence = sample.get_fragment()
         cur_forward = sequence + pcr1
         cur_reverse = reverse_complement(sequence) + PCR2
         # Number of PCR copies -- minimum 1
         num_dupes = toint(_rexp(options.library_pcr_lambda, rng)) + 1
         for dupe_id in xrange(num_dupes):
             cur_name = "%s_%s" % (name, dupe_id)
             reads.append((cur_name, cur_forward, cur_reverse))
     random.shuffle(reads)
     return reads
Ejemplo n.º 5
0
    def write_records(self, records):
        """Write each record to `self._handle` as a four-line FASTQ entry.

        Records flagged `is_reverse` are written with their sequence
        reverse complemented and their quality string reversed. An
        AssertionError carrying the record name is raised if the sequence
        and quality lengths differ.
        """
        for rec in records:
            bases = rec.seq
            scores = rec.qual
            if rec.is_reverse:
                bases, scores = reverse_complement(bases), scores[::-1]

            assert len(scores) == len(bases), rec.qname
            entry = (
                "@%s\n" % (rec.qname,),
                "%s\n" % (bases,),
                "+\n",
                "%s\n" % (scores,),
            )
            for line in entry:
                self._handle.write(line)
Ejemplo n.º 6
0
    def write_records(self, records):
        """Write each record to `self._handle` as a four-line FASTQ entry.

        Records flagged `is_reverse` are written with their sequence
        reverse complemented and their quality string reversed. An
        AssertionError carrying the record name is raised if the sequence
        and quality lengths differ.
        """
        for rec in records:
            bases = rec.seq
            scores = rec.qual
            if rec.is_reverse:
                bases, scores = reverse_complement(bases), scores[::-1]

            assert len(scores) == len(bases), rec.qname
            entry = (
                "@%s\n" % (rec.qname,),
                "%s\n" % (bases,),
                "+\n",
                "%s\n" % (scores,),
            )
            for line in entry:
                self._handle.write(line)
Ejemplo n.º 7
0
    def _get_endogenous_sequence(self):
        length = self._get_frag_len()
        max_position = len(self._specimen.sequence) - length
        position = self._random.randint(0, max_position)
        strand = self._random.choice(("fw", "rv"))

        sequence = self._specimen.sequence[position:position + length]
        real_pos = self._specimen.positions[position]
        if strand == "rv":
            sequence = reverse_complement("".join(sequence))

        self._endog_id += 1
        name = "Seq_%i_%i_%i_%s" % (self._endog_id, real_pos, length, strand)
        return (True, name, sequence)
Ejemplo n.º 8
0
    def _get_endogenous_sequence(self):
        length = self._get_frag_len()
        max_position = len(self._specimen.sequence) - length
        position = self._random.randint(0, max_position)
        strand = self._random.choice(("fw", "rv"))

        sequence = self._specimen.sequence[position:position + length]
        real_pos = self._specimen.positions[position]
        if strand == "rv":
            sequence = reverse_complement("".join(sequence))

        self._endog_id += 1
        name = "Seq_%i_%i_%i_%s" % (self._endog_id, real_pos, length, strand)
        return (True, name, sequence)
Ejemplo n.º 9
0
    def _collect_sequence(cls, fastafile, beds):
        sequence = []
        for bed in beds:
            fragment = fastafile.fetch(bed.contig, bed.start, bed.end)
            if len(fragment) != (bed.end - bed.start):
                cls._report_failure(bed, fragment)

            sequence.append(fragment)
        sequence = "".join(sequence)

        if any((bed.strand == "-") for bed in beds):
            assert all((bed.strand == "-") for bed in beds)
            sequence = sequtils.reverse_complement(sequence)

        return sequence
Ejemplo n.º 10
0
    def _collect_sequence(cls, fastafile, beds):
        sequence = []
        for bed in beds:
            fragment = fastafile.fetch(bed.contig, bed.start, bed.end)
            if len(fragment) != (bed.end - bed.start):
                cls._report_failure(bed, fragment)

            sequence.append(fragment)
        sequence = "".join(sequence)

        if any((bed.strand == "-") for bed in beds):
            assert all((bed.strand == "-") for bed in beds)
            sequence = sequtils.reverse_complement(sequence)

        return sequence
Ejemplo n.º 11
0
def build_genes(options, genotype, regions):
    """Yield a (name, sequence) pair for each run of same-named regions.

    `regions` is sorted in place by (contig, name, start) and then grouped
    by name. The pieces returned by `build_region` for each region in a
    group are concatenated; if any region in the group is on the "-"
    strand, all must be (asserted), and the sequence is reverse
    complemented before being yielded.
    """
    regions.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

    grouped = itertools.groupby(regions, key=lambda bed: bed.name)
    for gene, group in grouped:
        # Materialized, since the group is walked more than once
        group = tuple(group)
        pieces = itertools.chain.from_iterable(
            build_region(options, genotype, bed) for bed in group)
        joined = "".join(pieces)

        on_minus = [bed.strand == "-" for bed in group]
        if any(on_minus):
            assert all(on_minus)
            joined = sequences.reverse_complement(joined)

        yield (gene, joined)
Ejemplo n.º 12
0
def build_genes(options, genotype, regions):
    """Yield a (name, sequence) pair for each run of same-named regions.

    `regions` is sorted in place by (contig, name, start) and then grouped
    by name. The pieces returned by `build_region` for each region in a
    group are concatenated; if any region in the group is on the "-"
    strand, all must be (asserted), and the sequence is reverse
    complemented before being yielded.
    """
    regions.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

    grouped = itertools.groupby(regions, key=lambda bed: bed.name)
    for gene, group in grouped:
        # Materialized, since the group is walked more than once
        group = tuple(group)
        pieces = itertools.chain.from_iterable(
            build_region(options, genotype, bed) for bed in group)
        joined = "".join(pieces)

        on_minus = [bed.strand == "-" for bed in group]
        if any(on_minus):
            assert all(on_minus)
            joined = sequences.reverse_complement(joined)

        yield (gene, joined)
Ejemplo n.º 13
0
def _validate_makefile_adapters(makefile):
    """Checks for the default adapter sequences specified in the wrong
    orientation for AdapterRemoval, which is a typical mistake when using
    the --pcr2 option.

    The "AdapterRemoval" options of every record yielded by
    `_iterate_over_records`, as well as those at the top level of the
    makefile, are inspected. A warning is emitted via `print_warn` if any
    of them matches; nothing is returned and the makefile is not modified.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    # Maps each option to the value that indicates a flipped default adapter
    tests = {
        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
        "--pcr2": adapter_2,
        # --adapter2 (AdapterRemoval v2) expects the regular sequence
        "--adapter2": sequences.reverse_complement(adapter_2)
    }

    def check_options(options, results):
        # items()/values() are used rather than the Python 2 only
        # iteritems()/itervalues(), so this runs on both Python 2 and 3.
        for key, value in tests.items():
            if options.get(key) == value:
                results[key] = True

    results = dict.fromkeys(tests, False)
    for (_, _, _, _, record) in _iterate_over_records(makefile):
        adapterrm_opt = record.get("Options", {}).get("AdapterRemoval", {})
        check_options(adapterrm_opt, results)

    adapterrm_opt = makefile.get("Options", {}).get("AdapterRemoval", {})
    check_options(adapterrm_opt, results)

    if any(results.values()):
        print_warn(
            "WARNING: An adapter specified for AdapterRemoval "
            "corresponds to the default sequence, but is reverse "
            "complemented. Please make sure that this is intended! ",
            end="")

        if results["--pcr2"]:
            print_warn("For --pcr2, the sequence given should be the "
                       "reverse complement of the sequence observed in the "
                       "mate 2 FASTQ file.\n")

        if results["--adapter2"]:
            print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                       "should be exactly as observed in the FASTQ reads.\n")
Ejemplo n.º 14
0
def _validate_makefile_adapters(makefile):
    """Checks for the default adapter sequences specified in the wrong
    orientation for AdapterRemoval, which is a typical mistake when using
    the --pcr2 option.

    The "AdapterRemoval" options of every record yielded by
    `_iterate_over_records`, as well as those at the top level of the
    makefile, are inspected. A warning is emitted via `print_warn` if any
    of them matches; nothing is returned and the makefile is not modified.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    # Maps each option to the value that indicates a flipped default adapter
    tests = {
        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
        "--pcr2": adapter_2,
        # --adapter2 (AdapterRemoval v2) expects the regular sequence
        "--adapter2": sequences.reverse_complement(adapter_2)
    }

    def check_options(options, results):
        # items()/values() are used rather than the Python 2 only
        # iteritems()/itervalues(), so this runs on both Python 2 and 3.
        for key, value in tests.items():
            if options.get(key) == value:
                results[key] = True

    results = dict.fromkeys(tests, False)
    for (_, _, _, _, record) in _iterate_over_records(makefile):
        adapterrm_opt = record.get("Options", {}).get("AdapterRemoval", {})
        check_options(adapterrm_opt, results)

    adapterrm_opt = makefile.get("Options", {}).get("AdapterRemoval", {})
    check_options(adapterrm_opt, results)

    if any(results.values()):
        print_warn("WARNING: An adapter specified for AdapterRemoval "
                   "corresponds to the default sequence, but is reverse "
                   "complemented. Please make sure that this is intended! ",
                   end="")

        if results["--pcr2"]:
            print_warn("For --pcr2, the sequence given should be the "
                       "reverse complement of the sequence observed in the "
                       "mate 2 FASTQ file.\n")

        if results["--adapter2"]:
            print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                       "should be exactly as observed in the FASTQ reads.\n")
Ejemplo n.º 15
0
def build_regions(options, genotype, beds, reverse_compl):
    """Yield the result of `build_region` for each interval in `beds`.

    When `reverse_compl` is true, each result is passed through
    `sequences.reverse_complement` before being yielded.
    """
    for interval in beds:
        built = build_region(options, genotype, interval)
        yield sequences.reverse_complement(built) if reverse_compl else built
Ejemplo n.º 16
0
def test_reverse_complement():
    """reverse_complement(_REF_SRC) must equal _REF_DST read backwards."""
    observed = reverse_complement(_REF_SRC)
    expected = _REF_DST[::-1]
    assert_equal(observed, expected)
Ejemplo n.º 17
0
def test_reverse_complement():
    """reverse_complement(_REF_SRC) must equal _REF_DST read backwards."""
    observed = reverse_complement(_REF_SRC)
    expected = _REF_DST[::-1]
    assert_equal(observed, expected)
Ejemplo n.º 18
0
def build_regions(options, genotype, beds, reverse_compl):
    """Yield the result of `build_region` for each interval in `beds`.

    When `reverse_compl` is true, each result is passed through
    `sequences.reverse_complement` before being yielded.
    """
    for interval in beds:
        built = build_region(options, genotype, interval)
        yield sequences.reverse_complement(built) if reverse_compl else built