def main(argv):
    """Command-line entry point: genotype BED intervals from a tabix-indexed
    pileup file and write the resulting gene sequences as FASTA to stdout.

    Returns 0 on success.
    """
    prog = "paleomix sample_pileup"
    usage = "%s [options] --genotype in.vcf --intervals in.bed > out.fasta" \
        % (prog,)

    parser = argparse.ArgumentParser(prog=prog, usage=usage)
    parser.add_argument("--genotype", help="Tabix indexed pileup file.",
                        required=True, metavar="PILEUP")
    parser.add_argument("--intervals", help="BED file.", required=True,
                        metavar="BED")
    # BUG FIX: metavar was "BED" (copy-pasted from --intervals); --padding
    # takes a number of bases, not a BED file.
    parser.add_argument("--padding", type=int, default=10, metavar="N",
                        help="Number of bases to expand intervals, when "
                             "filtering based on adjacent indels "
                             "[%(default)s]")
    parser.add_argument("--min-distance-to-indels", type=int, default=5,
                        help="Variants closer than this distance from indels "
                             "are filtered; set to a negative value to "
                             "disable [%(default)s].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    with open(args.intervals) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, BEDRecord)

    # Iterate contigs in sorted order so the FASTA output is deterministic.
    for (_, beds) in sorted(intervals.items()):
        for (name, sequence) in build_genes(args, genotype, beds):
            FASTA(name, None, sequence).write(sys.stdout)

    return 0
# Example #2
# 0
def test_parse_lines_by_contig__single_contig():
    """Records from a single contig are all grouped under one key."""
    raw_lines = ["abc line1 \n", "abc line2 \n"]

    def _parser(text_line, text_length):
        # The parser is handed each line together with its length.
        assert text_length == len(text_line)
        return _RecordMock(*text_line.split())

    result = parse_lines_by_contig(raw_lines, _parser)
    assert result == {
        "abc": [
            _RecordMock("abc", "line1"),
            _RecordMock("abc", "line2"),
        ]
    }
# Example #3
# 0
def test_parse_lines__two_contigs():
    """Records from different contigs end up under separate keys."""
    raw_lines = ["abc line1 \n", "def line2 \n"]

    def _parser(text_line, text_length):
        # The parser is handed each line together with its length.
        assert_equal(len(text_line), text_length)
        return _RecordMock(*text_line.split())

    result = parse_lines_by_contig(raw_lines, _parser)
    expected = {
        "abc": [_RecordMock("abc", "line1")],
        "def": [_RecordMock("def", "line2")],
    }
    assert_equal(result, expected)
# Example #4
# 0
def test_parse_lines__two_contigs():
    """Each contig key maps to exactly the records parsed from its lines."""
    raw = ["abc line1 \n", "def line2 \n"]

    def _parser(line, size):
        # Lines are passed to the parser along with their lengths.
        assert size == len(line)
        return _RecordMock(*line.split())

    assert parse_lines_by_contig(raw, _parser) == {
        "abc": [_RecordMock("abc", "line1")],
        "def": [_RecordMock("def", "line2")],
    }
# Example #5
# 0
def read_intervals(filename):
    """Read a BED file and return a dict mapping contig name to a list of
    BEDRecords, or None if any record has fewer than the 6 required fields.

    An error message is written to stderr for the first invalid record.
    """
    with open(filename) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, BEDRecord)

    # Validate records only; the original rebuilt each list into an
    # identical copy ('bed_tuples'), which was a pointless no-op.
    for beds in intervals.values():
        for bed in beds:
            if len(bed) < 6:
                sys.stderr.write(("ERROR: Invalid BED record '%r', must "
                                  "have at least 6 fields\n") % (bed, ))
                return None

    return intervals
def read_intervals(filename):
    """Read a BED file and return a dict mapping contig name to a list of
    BEDRecords, or None if any record has fewer than the 6 required fields.

    An error message is written to stderr for the first invalid record.
    """
    with open(filename) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, BEDRecord)

    # BUG FIX: dict.iteritems() is Python 2 only and raises AttributeError
    # on Python 3; use values() instead.  Also dropped the 'bed_tuples'
    # rebuild, which copied each list into an identical list (a no-op).
    for beds in intervals.values():
        for bed in beds:
            if len(bed) < 6:
                sys.stderr.write(("ERROR: Invalid BED record '%r', must "
                                  "have at least 6 fields ...\n") % (bed,))
                return None

    return intervals
# Example #7
# 0
    def _run(self, _config, temp):
        """Extract the sequence of each gene listed in self._bedfile from
        self._reference and write the results as FASTA to self._outfile.

        BED records sharing a name are treated as one (multi-exon) gene;
        output is sorted by gene name for deterministic results.
        """
        def _by_name(bed):
            return bed.name

        fastafile = pysam.Fastafile(self._reference)
        seqs = collections.defaultdict(list)
        with open(self._bedfile) as bedfile:
            bedrecords = text.parse_lines_by_contig(bedfile, BEDRecord)
            # BUG FIX: dict.iteritems() is Python 2 only and raises
            # AttributeError on Python 3; items() works on both.
            for (contig, beds) in sorted(bedrecords.items()):
                # groupby requires its input pre-sorted by the grouping key.
                beds.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

                for (gene, gene_beds) in itertools.groupby(beds, _by_name):
                    gene_beds = tuple(gene_beds)
                    sequence = self._collect_sequence(fastafile, gene_beds)
                    seqs[(contig, gene)] = sequence

        # Write to a temp file first, then move into place when complete.
        temp_file = os.path.join(temp, "sequences.fasta")
        with open(temp_file, "w") as out_file:
            for ((_, gene), sequence) in sorted(seqs.items()):
                FASTA(gene, None, sequence).write(out_file)

        fileutils.move_file(temp_file, self._outfile)
# Example #8
# 0
    def _run(self, _config, temp):
        """Extract the sequence of each gene listed in self._bedfile from
        self._reference and write the results as FASTA to self._outfile.

        BED records sharing a name are treated as one (multi-exon) gene;
        output is sorted by gene name for deterministic results.
        """
        def _by_name(bed):
            return bed.name

        fastafile = pysam.Fastafile(self._reference)
        seqs = collections.defaultdict(list)
        with open(self._bedfile) as bedfile:
            bedrecords = text.parse_lines_by_contig(bedfile, BEDRecord)
            # BUG FIX: dict.iteritems() is Python 2 only and raises
            # AttributeError on Python 3; items() works on both.
            for (contig, beds) in sorted(bedrecords.items()):
                # groupby requires its input pre-sorted by the grouping key.
                beds.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

                for (gene, gene_beds) in itertools.groupby(beds, _by_name):
                    gene_beds = tuple(gene_beds)
                    sequence = self._collect_sequence(fastafile, gene_beds)
                    seqs[(contig, gene)] = sequence

        # Write to a temp file first, then move into place when complete.
        temp_file = os.path.join(temp, "sequences.fasta")
        with open(temp_file, "w") as out_file:
            for ((_, gene), sequence) in sorted(seqs.items()):
                FASTA(gene, None, sequence).write(out_file)

        fileutils.move_file(temp_file, self._outfile)
# Example #9
# 0
def main(argv):
    """Command-line entry point: genotype BED intervals from a tabix-indexed
    pileup file and write the resulting gene sequences as FASTA to stdout.

    Returns 0 on success.
    """
    prog = "paleomix sample_pileup"
    usage = "%s [options] --genotype in.vcf --intervals in.bed > out.fasta" \
        % (prog,)

    parser = argparse.ArgumentParser(prog=prog, usage=usage)
    parser.add_argument("--genotype",
                        help="Tabix indexed pileup file.",
                        required=True,
                        metavar="PILEUP")
    parser.add_argument("--intervals",
                        help="BED file.",
                        required=True,
                        metavar="BED")
    # BUG FIX: metavar was "BED" (copy-pasted from --intervals); --padding
    # takes a number of bases, not a BED file.
    parser.add_argument("--padding",
                        type=int,
                        default=10,
                        metavar="N",
                        help="Number of bases to expand intervals, when "
                        "filtering based on adjacent indels "
                        "[%(default)s]")
    parser.add_argument("--min-distance-to-indels",
                        type=int,
                        default=5,
                        help="Variants closer than this distance from indels "
                        "are filtered; set to a negative value to "
                        "disable [%(default)s].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    with open(args.intervals) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, BEDRecord)

    # Iterate contigs in sorted order so the FASTA output is deterministic.
    for (_, beds) in sorted(intervals.items()):
        for (name, sequence) in build_genes(args, genotype, beds):
            FASTA(name, None, sequence).write(sys.stdout)

    return 0