コード例 #1
0
ファイル: validation.py プロジェクト: MikkelSchubert/paleomix
def _process_bam_reads(observed_reads, references, position, err_func):
    """Report reads that were observed more than once at one position.

    Each value of `observed_reads` is a sequence of (record, filename)
    pairs. Records that agree on strand, name, sequence and qualities are
    grouped, and every group with more than one member is handed to
    `err_func(chrom, pos, records, name, seq, qual)`, where `records` maps
    each filename to the matching records found in it.

    `position` is read as (pos, index), with the index looked up in
    `references` to obtain `chrom`.
    """
    # values() / items() are used instead of the Python 2 only itervalues()
    # / iteritems(), so the function runs on both Python 2 and Python 3.
    for records_and_filenames in observed_reads.values():
        if len(records_and_filenames) == 1:
            # Most read-names should be observed at most once at a position
            continue

        result = collections.defaultdict(list)
        for record, filename in records_and_filenames:
            key = (record.is_reverse, record.qname, record.seq, record.qual)
            result[key].append((filename, record))

        for (is_reverse, name, seq, qual), filenames in result.items():
            if len(filenames) == 1:
                # Two reads had same name, but different characteristics
                continue

            records = collections.defaultdict(list)
            for filename, record in filenames:
                records[filename].append(record)

            # Reverse-strand records are reported with the sequence reverse
            # complemented and the qualities reversed.
            if is_reverse:
                seq = reverse_complement(seq)
                qual = qual[::-1]

            chrom = references[position[1]]
            pos = position[0]

            err_func(chrom, pos, records, name, seq, qual)
コード例 #2
0
ファイル: validation.py プロジェクト: tmancill/paleomix
def _process_bam_reads(observed_reads, references, position, err_func):
    """Invoke `err_func` for every read found more than once at `position`.

    The values of `observed_reads` are sequences of (record, filename)
    pairs. Within each value, records are grouped by strand, name, sequence
    and qualities; a group holding several records is reported through
    `err_func(chrom, pos, records, name, seq, qual)`, with `records` being a
    mapping from filename to the list of records seen in that file.

    `position` is indexed as (pos, index); the index selects `chrom` from
    `references`.
    """
    for candidates in observed_reads.values():
        if len(candidates) == 1:
            # The usual case: a single record, so nothing to compare
            continue

        # Bucket the records by everything that identifies a read
        grouped = {}
        for record, filename in candidates:
            identity = (record.is_reverse, record.qname, record.seq, record.qual)
            grouped.setdefault(identity, []).append((filename, record))

        for identity, hits in grouped.items():
            if len(hits) == 1:
                # Shares a name with other records, but is not a copy of them
                continue

            is_reverse, name, seq, qual = identity

            records = collections.defaultdict(list)
            for filename, record in hits:
                records[filename].append(record)

            if is_reverse:
                # Report reverse-strand records with the sequence reverse
                # complemented and the qualities reversed
                seq = reverse_complement(seq)
                qual = qual[::-1]

            err_func(
                references[position[1]], position[0], records, name, seq, qual
            )
コード例 #3
0
ファイル: remap.py プロジェクト: muslih14/paleomix
    def write_records(self, records):
        """Write every record to the output handle as a FASTQ entry.

        Four lines are written per record: "@" plus the read name, the
        sequence, a "+" separator and the quality string. Records flagged
        as reverse have their sequence reverse complemented and their
        qualities reversed before being written.

        An AssertionError carrying the read name is raised if the sequence
        and quality strings differ in length.
        """
        for record in records:
            if record.is_reverse:
                seq = reverse_complement(record.seq)
                qual = record.qual[::-1]
            else:
                seq = record.seq
                qual = record.qual

            assert len(qual) == len(seq), record.qname

            lines = (
                "@%s\n" % (record.qname,),
                "%s\n" % (seq,),
                "+\n",
                "%s\n" % (qual,),
            )
            for line in lines:
                self._handle.write(line)
コード例 #4
0
ファイル: remap.py プロジェクト: jelber2/paleomix
    def write_records(self, records):
        """Emit each record to the output handle in FASTQ layout.

        Per record this writes the "@"-prefixed read name, the sequence, a
        "+" line and the qualities. For records marked as reverse, the
        sequence is reverse complemented and the qualities are reversed
        first.

        Raises AssertionError (with the read name as message) when the
        sequence and qualities are of unequal length.
        """
        for record in records:
            if record.is_reverse:
                seq = reverse_complement(record.seq)
                qual = record.qual[::-1]
            else:
                seq = record.seq
                qual = record.qual

            assert len(qual) == len(seq), record.qname

            lines = (
                "@%s\n" % (record.qname,),
                "%s\n" % (seq,),
                "+\n",
                "%s\n" % (qual,),
            )
            for line in lines:
                self._handle.write(line)
コード例 #5
0
ファイル: synthesize_reads.py プロジェクト: jelber2/paleomix
 def _generate_reads(cls, options, rng, sample, minimum, pcr1):
     """Build a shuffled list of at least `minimum` synthetic reads.

     Fragments are drawn from `sample.get_fragment()` until enough reads
     exist. Each fragment yields one or more copies (PCR duplicates), all
     sharing the same sequences but named "<name>_<copy number>".

     Returns a list of (name, forward, reverse) tuples, where forward is
     the fragment followed by `pcr1` and reverse is the reverse complement
     of the fragment followed by PCR2.
     """
     reads = []
     while len(reads) < minimum:
         name, sequence = sample.get_fragment()
         cur_forward = sequence + pcr1
         cur_reverse = reverse_complement(sequence) + PCR2
         # Number of PCR copies -- minimum 1
         num_dupes = toint(_rexp(options.library_pcr_lambda, rng)) + 1
         # range() instead of xrange(): xrange does not exist on Python 3,
         # and the number of copies is small enough for either on Python 2.
         reads.extend(
             ("%s_%s" % (name, dupe_id), cur_forward, cur_reverse)
             for dupe_id in range(num_dupes)
         )
     # NOTE(review): the shuffle uses the module-level `random` state and
     # not `rng` -- confirm whether that is intended for seeded runs.
     random.shuffle(reads)
     return reads
コード例 #6
0
ファイル: synthesize_reads.py プロジェクト: muslih14/paleomix
 def _generate_reads(cls, options, rng, sample, minimum, pcr1):
     """Build a shuffled list of at least `minimum` synthetic reads.

     Fragments are drawn from `sample.get_fragment()` until enough reads
     exist. Each fragment yields one or more copies (PCR duplicates), all
     sharing the same sequences but named "<name>_<copy number>".

     Returns a list of (name, forward, reverse) tuples, where forward is
     the fragment followed by `pcr1` and reverse is the reverse complement
     of the fragment followed by PCR2.
     """
     reads = []
     while len(reads) < minimum:
         name, sequence = sample.get_fragment()
         cur_forward = sequence + pcr1
         cur_reverse = reverse_complement(sequence) + PCR2
         # Number of PCR copies -- minimum 1
         num_dupes = toint(_rexp(options.library_pcr_lambda, rng)) + 1
         # range() instead of xrange(): xrange does not exist on Python 3,
         # and the number of copies is small enough for either on Python 2.
         reads.extend(
             ("%s_%s" % (name, dupe_id), cur_forward, cur_reverse)
             for dupe_id in range(num_dupes)
         )
     # NOTE(review): the shuffle uses the module-level `random` state and
     # not `rng` -- confirm whether that is intended for seeded runs.
     random.shuffle(reads)
     return reads
コード例 #7
0
ファイル: synthesize_reads.py プロジェクト: muslih14/paleomix
    def _get_endogenous_sequence(self):
        """Draw one randomly placed fragment from the specimen sequence.

        The fragment length comes from `_get_frag_len()`; the start offset
        and the strand ("fw" or "rv") are picked with `self._random`.
        Fragments on the "rv" strand are joined into a string and reverse
        complemented; "fw" fragments are returned as sliced.

        Returns a (True, name, sequence) tuple, where the name encodes a
        running fragment counter, the value of `positions` at the start
        offset, the fragment length and the strand.
        """
        frag_len = self._get_frag_len()
        last_start = len(self._specimen.sequence) - frag_len
        start = self._random.randint(0, last_start)
        strand = self._random.choice(("fw", "rv"))

        fragment = self._specimen.sequence[start:start + frag_len]
        origin = self._specimen.positions[start]
        if strand == "rv":
            fragment = reverse_complement("".join(fragment))

        self._endog_id += 1
        name_fields = (self._endog_id, origin, frag_len, strand)
        return (True, "Seq_%i_%i_%i_%s" % name_fields, fragment)
コード例 #8
0
ファイル: synthesize_reads.py プロジェクト: jelber2/paleomix
    def _get_endogenous_sequence(self):
        """Sample a fragment at a random offset of the specimen sequence.

        Length is taken from `_get_frag_len()`, after which `self._random`
        chooses the start offset and then the strand ("fw" or "rv"). An
        "rv" fragment is joined into a string and reverse complemented,
        while an "fw" fragment is returned exactly as sliced.

        Returns a (True, name, sequence) tuple; the name is built from a
        running fragment counter, the entry of `positions` at the start
        offset, the fragment length and the strand.
        """
        frag_len = self._get_frag_len()
        last_start = len(self._specimen.sequence) - frag_len
        start = self._random.randint(0, last_start)
        strand = self._random.choice(("fw", "rv"))

        fragment = self._specimen.sequence[start:start + frag_len]
        origin = self._specimen.positions[start]
        if strand == "rv":
            fragment = reverse_complement("".join(fragment))

        self._endog_id += 1
        name_fields = (self._endog_id, origin, frag_len, strand)
        return (True, "Seq_%i_%i_%i_%s" % name_fields, fragment)
コード例 #9
0
ファイル: validation.py プロジェクト: muslih14/paleomix
    def _process_reads(cls, observed_reads, output_files):
        """Raise NodeError if an identical read was observed more than once.

        Each value of `observed_reads` is a sequence of (record, filename)
        pairs. Records agreeing on strand, name, sequence and qualities are
        counted per filename; the first such group with more than one
        member triggers a NodeError whose message lists the read, how often
        it was found in each file, and a `touch` command for every path in
        `output_files`.
        """
        # values() / items() are used instead of the Python 2 only
        # itervalues() / iteritems(), so this runs on Python 2 and 3 alike.
        for records_and_filenames in observed_reads.values():
            if len(records_and_filenames) == 1:
                # Most read-names should be observed at most once at a position
                continue

            result = collections.defaultdict(list)
            for record, filename in records_and_filenames:
                key = (record.is_reverse, record.qname, record.seq, record.qual)
                result[key].append(filename)

            for (is_reverse, name, seq, qual), filenames in result.items():
                if len(filenames) == 1:
                    # Two reads had same name, but different characteristics
                    continue

                # Number of times the read was seen in each file
                filename_counts = collections.Counter(filenames)

                # Reverse-strand records are shown with the sequence reverse
                # complemented and the qualities reversed.
                if is_reverse:
                    seq = reverse_complement(seq)
                    qual = qual[::-1]

                message = ["The same read was found multiple times!",
                           "    Name:      %r" % (name,),
                           "    Sequence:  %r" % (seq,),
                           "    Qualities: %r" % (qual,),
                           "",
                           "Read was found"]

                for filename, count in sorted(filename_counts.items()):
                    message.append("   % 2ix in %r" % (count, filename))

                message.append("")
                message.append("This indicates that the same data files have "
                               "been included multiple times in the project. "
                               "Please review the input files used in this "
                               "project, to ensure that each set of data is "
                               "included only once!\n\n"

                               "If this is not the case, then execute the "
                               "following command(s) to mark this test as "
                               "having succeeded:")

                for fpath in output_files:
                    message.append("$ touch '%s'" % (fpath,))

                raise NodeError("\n".join(message))
コード例 #10
0
ファイル: sequences.py プロジェクト: jelber2/paleomix
    def _collect_sequence(cls, fastafile, beds):
        """Fetch and concatenate the sequence covered by `beds`.

        Every bed is fetched from `fastafile` by contig, start and end. A
        fragment whose length differs from `end - start` is passed to
        `cls._report_failure`. If any bed is on the "-" strand, all of them
        must be (checked by assertion), and the joined sequence is reverse
        complemented.
        """
        fragments = []
        for bed in beds:
            fragment = fastafile.fetch(bed.contig, bed.start, bed.end)
            expected_length = bed.end - bed.start
            if len(fragment) != expected_length:
                cls._report_failure(bed, fragment)

            fragments.append(fragment)

        joined = "".join(fragments)

        on_minus_strand = [bed.strand == "-" for bed in beds]
        if any(on_minus_strand):
            # Mixed strands within one set of beds are not supported
            assert all(on_minus_strand)
            joined = sequtils.reverse_complement(joined)

        return joined
コード例 #11
0
ファイル: sequences.py プロジェクト: MikkelSchubert/paleomix
    def _collect_sequence(cls, fastafile, beds):
        """Assemble the sequence spanned by `beds` from `fastafile`.

        Each bed is fetched by contig, start and end; when the fetched
        fragment is not `end - start` long, `cls._report_failure` is called
        with the bed and the fragment. Should any bed lie on the "-"
        strand, an assertion requires that all do, and the concatenated
        sequence is reverse complemented.
        """
        fragments = []
        for bed in beds:
            fragment = fastafile.fetch(bed.contig, bed.start, bed.end)
            expected_length = bed.end - bed.start
            if len(fragment) != expected_length:
                cls._report_failure(bed, fragment)

            fragments.append(fragment)

        joined = "".join(fragments)

        on_minus_strand = [bed.strand == "-" for bed in beds]
        if any(on_minus_strand):
            # Mixed strands within one set of beds are not supported
            assert all(on_minus_strand)
            joined = sequtils.reverse_complement(joined)

        return joined
コード例 #12
0
def build_genes(options, genotype, regions):
    """Yield a (gene, sequence) pair for each run of same-named regions.

    `regions` is sorted in place by (contig, name, start) and then grouped
    by name. The output of `build_region` for every region in a group is
    concatenated; if any region of the group is on the "-" strand, all
    must be (checked by assertion) and the result is reverse complemented.
    """
    regions.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

    for gene, group in itertools.groupby(regions, lambda bed: bed.name):
        members = tuple(group)
        pieces = itertools.chain.from_iterable(
            build_region(options, genotype, bed) for bed in members
        )
        sequence = "".join(pieces)

        on_minus_strand = [bed.strand == "-" for bed in members]
        if any(on_minus_strand):
            # A gene may not mix regions from both strands
            assert all(on_minus_strand)
            sequence = sequences.reverse_complement(sequence)

        yield (gene, sequence)
コード例 #13
0
ファイル: sample_pileup.py プロジェクト: jelber2/paleomix
def build_genes(options, genotype, regions):
    """Generate (gene, sequence) pairs, one per run of same-named regions.

    Sorts `regions` in place by (contig, name, start), groups them by name
    and joins the output of `build_region` for each member of a group. When
    any member is on the "-" strand, an assertion requires all members to
    be, and the joined sequence is reverse complemented.
    """
    regions.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

    for gene, group in itertools.groupby(regions, lambda bed: bed.name):
        members = tuple(group)
        pieces = itertools.chain.from_iterable(
            build_region(options, genotype, bed) for bed in members
        )
        sequence = "".join(pieces)

        on_minus_strand = [bed.strand == "-" for bed in members]
        if any(on_minus_strand):
            # A gene may not mix regions from both strands
            assert all(on_minus_strand)
            sequence = sequences.reverse_complement(sequence)

        yield (gene, sequence)
コード例 #14
0
ファイル: makefile.py プロジェクト: jelber2/paleomix
def _validate_makefile_adapters(makefile):
    """Checks for the default adapter sequences specified in the wrong
    orientation for AdapterRemoval, which is a typical mistake when using
    the --pcr2 option.

    The AdapterRemoval options of every record yielded by
    `_iterate_over_records(makefile)`, and of the makefile itself, are
    compared against the wrongly oriented default sequences; a warning is
    printed with `print_warn` if any of them match. Nothing is returned.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    # Maps each option to the value that indicates a wrong orientation
    tests = {
        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
        "--pcr2": adapter_2,
        # --adapter2 (AdapterRemoval v2) expects the regular sequence
        "--adapter2": sequences.reverse_complement(adapter_2)
    }

    def check_options(options, results):
        # items() / values() are used instead of the Python 2 only
        # iteritems() / itervalues(), so this runs on Python 2 and 3 alike.
        for key, value in tests.items():
            if options.get(key) == value:
                results[key] = True

    results = dict.fromkeys(tests, False)
    for (_, _, _, _, record) in _iterate_over_records(makefile):
        adapterrm_opt = record.get("Options", {}).get("AdapterRemoval", {})
        check_options(adapterrm_opt, results)

    adapterrm_opt = makefile.get("Options", {}).get("AdapterRemoval", {})
    check_options(adapterrm_opt, results)

    if any(results.values()):
        print_warn(
            "WARNING: An adapter specified for AdapterRemoval "
            "corresponds to the default sequence, but is reverse "
            "complemented. Please make sure that this is intended! ",
            end="")

        if results["--pcr2"]:
            print_warn("For --pcr2, the sequence given should be the "
                       "reverse complement of the sequence observed in the "
                       "mate 2 FASTQ file.\n")

        if results["--adapter2"]:
            print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                       "should be exactly as observed in the FASTQ reads.\n")
コード例 #15
0
ファイル: makefile.py プロジェクト: MikkelSchubert/paleomix
def _validate_makefile_adapters(makefile):
    """Checks for the default adapter sequences specified in the wrong
    orientation for AdapterRemoval, which is a typical mistake when using
    the --pcr2 option.

    The AdapterRemoval options of every record yielded by
    `_iterate_over_records(makefile)`, and of the makefile itself, are
    compared against the wrongly oriented default sequences; a warning is
    printed with `print_warn` if any of them match. Nothing is returned.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    # Maps each option to the value that indicates a wrong orientation
    tests = {
        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
        "--pcr2": adapter_2,
        # --adapter2 (AdapterRemoval v2) expects the regular sequence
        "--adapter2": sequences.reverse_complement(adapter_2)
    }

    def check_options(options, results):
        # items() / values() are used instead of the Python 2 only
        # iteritems() / itervalues(), so this runs on Python 2 and 3 alike.
        for key, value in tests.items():
            if options.get(key) == value:
                results[key] = True

    results = dict.fromkeys(tests, False)
    for (_, _, _, _, record) in _iterate_over_records(makefile):
        adapterrm_opt = record.get("Options", {}).get("AdapterRemoval", {})
        check_options(adapterrm_opt, results)

    adapterrm_opt = makefile.get("Options", {}).get("AdapterRemoval", {})
    check_options(adapterrm_opt, results)

    if any(results.values()):
        print_warn("WARNING: An adapter specified for AdapterRemoval "
                   "corresponds to the default sequence, but is reverse "
                   "complemented. Please make sure that this is intended! ",
                   end="")

        if results["--pcr2"]:
            print_warn("For --pcr2, the sequence given should be the "
                       "reverse complement of the sequence observed in the "
                       "mate 2 FASTQ file.\n")

        if results["--adapter2"]:
            print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                       "should be exactly as observed in the FASTQ reads.\n")
コード例 #16
0
def build_regions(options, genotype, beds, reverse_compl):
    """Yield the `build_region` result for each bed, in order.

    When `reverse_compl` is true, every result is passed through
    `sequences.reverse_complement` before being yielded.
    """
    for bed in beds:
        region = build_region(options, genotype, bed)
        yield sequences.reverse_complement(region) if reverse_compl else region
コード例 #17
0
def test_reverse_complement():
    # The reverse complement of the source table must equal the complement
    # table read backwards.
    observed = reverse_complement(_REF_SRC)
    expected = _REF_DST[::-1]
    assert_equal(observed, expected)
コード例 #18
0
ファイル: sequences_test.py プロジェクト: tmancill/paleomix
def test_reverse_complement():
    # The reverse complement of the source table must equal the complement
    # table read backwards.
    observed = reverse_complement(_REF_SRC)
    expected = _REF_DST[::-1]
    assert observed == expected
コード例 #19
0
ファイル: sequences_test.py プロジェクト: muslih14/paleomix
def test_reverse_complement():
    # The reverse complement of the source table must equal the complement
    # table read backwards.
    observed = reverse_complement(_REF_SRC)
    expected = _REF_DST[::-1]
    assert_equal(observed, expected)
コード例 #20
0
ファイル: vcf_to_fasta.py プロジェクト: tmancill/paleomix
def build_regions(options, genotype, beds, reverse_compl):
    """Generate one `build_region` result per bed, preserving order.

    If `reverse_compl` is true, each result is run through
    `sequences.reverse_complement` before it is yielded.
    """
    for bed in beds:
        region = build_region(options, genotype, bed)
        yield sequences.reverse_complement(region) if reverse_compl else region