def _run(self, _config, temp):
    """Extract one sequence per gene from the reference FASTA.

    Reads BED intervals from self._intervals, concatenates the fetched
    fragments of each gene (grouped by record name within a contig),
    reverse-complements genes whose records lie on the minus strand, and
    writes the result as FASTA to self._outfile via a temporary file.
    """
    reference = pysam.Fastafile(self._reference)
    gene_seqs = collections.defaultdict(list)

    with open(self._intervals) as handle:
        grouped = text.parse_lines_by_contig(handle, pysam.asBed()).items()
        for contig, records in sorted(grouped):
            # Sort by (contig, name, start) so groupby sees each gene as a
            # single contiguous run, with fragments in coordinate order.
            records.sort(key=lambda rec: (rec.contig, rec.name, rec.start))
            for gene, group in itertools.groupby(records, lambda rec: rec.name):
                group = tuple(group)
                for rec in group:
                    gene_seqs[(contig, gene)].append(
                        reference.fetch(contig, rec.start, rec.end))
                merged = "".join(gene_seqs[(contig, gene)])
                if any(rec.strand == "-" for rec in group):
                    # Mixed-strand genes are not supported.
                    assert all(rec.strand == "-" for rec in group)
                    merged = sequences.reverse_complement(merged)
                gene_seqs[(contig, gene)] = merged

    temp_file = os.path.join(temp, "sequences.fasta")
    with open(temp_file, "w") as out_file:
        for (_, gene), sequence in sorted(gene_seqs.items()):
            fasta.print_fasta(gene, sequence, out_file)
    move_file(temp_file, self._outfile)
def main(argv):
    """Command-line entry point.

    Parses a tabix-indexed pileup ("genotype") and a BED file of
    intervals, builds gene sequences via build_genes, and writes them as
    FASTA records to stdout.

    Returns 0 on success.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--genotype", required=True,
                        help="Tabix indexed pileup file.")
    parser.add_argument("--intervals", required=True,
                        help="BED file.")
    # NOTE: argparse interpolates help strings with a dict, so the
    # optparse-style "[%default]" placeholder used previously would raise
    # a TypeError when --help is rendered; "%(default)s" is the argparse
    # equivalent and is substituted with the declared default.
    parser.add_argument("--padding", type=int, default=10,
                        help="Number of bases to expand intervals, when "
                             "filtering based on adjacent indels "
                             "[%(default)s]")
    parser.add_argument("--min-distance-to-indels", type=int, default=5,
                        help="Variants closer than this distance from indels "
                             "are filtered [%(default)s].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    with open(args.intervals) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

    for (_, beds) in sorted(intervals.items()):
        for (name, sequence) in build_genes(args, genotype, beds):
            FASTA(name, None, sequence).write(sys.stdout)
    return 0
def _run(self, _config, temp):
    """Build per-gene sequences from BED intervals over the reference.

    For each contig, records sharing a name form one gene: their fetched
    fragments are concatenated in start order, reverse-complemented when
    all records are on the minus strand, and written as FASTA entries to
    self._outfile (atomically, via a temp file in 'temp').
    """
    def sort_key(record):
        return (record.contig, record.name, record.start)

    fastafile = pysam.Fastafile(self._reference)
    collected = collections.defaultdict(list)

    with open(self._intervals) as bedfile:
        per_contig = text.parse_lines_by_contig(bedfile, pysam.asBed()).items()
        for contig, beds in sorted(per_contig):
            beds.sort(key=sort_key)
            for gene, members in itertools.groupby(beds, lambda bed: bed.name):
                members = tuple(members)
                key = (contig, gene)
                for bed in members:
                    collected[key].append(
                        fastafile.fetch(contig, bed.start, bed.end))
                joined = "".join(collected[key])
                if any(bed.strand == "-" for bed in members):
                    # A gene must be entirely on one strand.
                    assert all(bed.strand == "-" for bed in members)
                    joined = sequences.reverse_complement(joined)
                collected[key] = joined

    temp_file = os.path.join(temp, "sequences.fasta")
    with open(temp_file, "w") as out_file:
        for (_, gene), sequence in sorted(collected.items()):
            fasta.print_fasta(gene, sequence, out_file)
    move_file(temp_file, self._outfile)
def test_parse_lines__two_contigs():
    """Records from two contigs are grouped under their own keys."""
    input_lines = ["abc line1 \n", "def line2 \n"]

    def _record_parser(text_line, expected_length):
        # parse_lines_by_contig passes each line together with its length.
        assert_equal(len(text_line), expected_length)
        return _RecordMock(*text_line.split())

    expected = {
        "abc": [_RecordMock("abc", "line1")],
        "def": [_RecordMock("def", "line2")],
    }
    assert_equal(parse_lines_by_contig(input_lines, _record_parser), expected)
def test_parse_lines__two_contigs():
    """Lines belonging to different contigs end up in separate groups."""
    lines = ["abc line1 \n", "def line2 \n"]

    def _parse_record(raw, length):
        # The callback is handed both the raw line and its length.
        assert_equal(len(raw), length)
        return _RecordMock(*raw.split())

    result = parse_lines_by_contig(lines, _parse_record)
    assert_equal(result,
                 {"abc": [_RecordMock("abc", "line1")],
                  "def": [_RecordMock("def", "line2")]})
def read_intervals(filename):
    """Read a BED file and group its records by contig.

    Returns a dict mapping contig name to a list of BEDRecord, or None if
    any record has fewer than the 6 required fields (an error message is
    written to stderr in that case).
    """
    with open(filename) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, BEDRecord)

    for key, records in intervals.iteritems():
        validated = []
        for record in records:
            if len(record) < 6:
                message = ("ERROR: Invalid BED record '%r', must "
                           "have at least 6 fields ...\n") % (record,)
                sys.stderr.write(message)
                return None
            validated.append(record)
        intervals[key] = validated
    return intervals
def read_intervals(filename):
    """Parse BED records from 'filename', grouped by contig.

    Every record must carry at least 6 fields; otherwise an error is
    reported on stderr and None is returned instead of the dict.
    """
    with open(filename) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, BEDRecord)

    for contig, records in intervals.iteritems():
        checked = []
        for record in records:
            if len(record) >= 6:
                checked.append(record)
                continue
            sys.stderr.write(
                ("ERROR: Invalid BED record '%r', must "
                 "have at least 6 fields ...\n") % (record, ))
            return None
        intervals[contig] = checked
    return intervals
def _run(self, _config, temp):
    """Collect one sequence per (contig, gene) from BED records and write
    the sequences as FASTA to self._outfile via a temp file in 'temp'.

    Sequence assembly itself is delegated to self._collect_sequence.
    """
    fastafile = pysam.Fastafile(self._reference)

    with open(self._bedfile) as bedfile:
        bedrecords = text.parse_lines_by_contig(bedfile, BEDRecord)

    # Plain dict suffices: every key is assigned exactly once below.
    seqs = {}
    for contig, beds in sorted(bedrecords.iteritems()):
        # Sorting by name (then start) makes each gene one contiguous run
        # for groupby, with its records in coordinate order.
        beds.sort(key=lambda bed: (bed.contig, bed.name, bed.start))
        for gene, grouped in itertools.groupby(beds, lambda bed: bed.name):
            seqs[(contig, gene)] = \
                self._collect_sequence(fastafile, tuple(grouped))

    temp_file = os.path.join(temp, "sequences.fasta")
    with open(temp_file, "w") as out_file:
        for (_, gene), sequence in sorted(seqs.items()):
            FASTA(gene, None, sequence).write(out_file)
    fileutils.move_file(temp_file, self._outfile)
def _run(self, _config, temp):
    """Assemble a sequence for every (contig, gene) pair found in the BED
    file and emit them as FASTA records into self._outfile.

    Records are parsed with pysam's BED proxy; per-gene assembly is done
    by self._collect_sequence. Output goes through a temporary file so
    the final move is atomic.
    """
    def by_gene_name(bed):
        return bed.name

    fastafile = pysam.Fastafile(self._reference)

    with open(self._bedfile) as bedfile:
        bedrecords = text.parse_lines_by_contig(bedfile, pysam.asBed())

    sequences_by_key = {}
    for contig, beds in sorted(bedrecords.iteritems()):
        # Group records of the same gene together, ordered by start.
        beds.sort(key=lambda bed: (bed.contig, bed.name, bed.start))
        for gene, members in itertools.groupby(beds, by_gene_name):
            sequence = self._collect_sequence(fastafile, tuple(members))
            sequences_by_key[(contig, gene)] = sequence

    temp_file = os.path.join(temp, "sequences.fasta")
    with open(temp_file, "w") as out_file:
        for (_, gene), sequence in sorted(sequences_by_key.items()):
            FASTA(gene, None, sequence).write(out_file)
    fileutils.move_file(temp_file, self._outfile)
def read_intervals(filename):
    """Read a BED file into a dict of contig -> list of BEDTuple (BED6).

    Records with fewer than 6 fields abort the read: an error is written
    to stderr and None is returned. Numeric fields (start, end, score)
    are converted to int.
    """
    with open(filename) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

    for key, records in intervals.iteritems():
        converted = []
        for record in records:
            if len(record) < 6:
                sys.stderr.write(("ERROR: Invalid BED record '%s', must "
                                  "have at least 6 fields ...\n")
                                 % ("\\t".join(record),))
                return None

            # Copy into a plain named tuple, as Pysam has a tendency to
            # segfault if you do anything wrong with its proxy objects.
            fields = list(record)[:6]   # BED6 only
            fields[1] = int(fields[1])  # start
            fields[2] = int(fields[2])  # end
            fields[4] = int(fields[4])  # score
            converted.append(BEDTuple(*fields))
        intervals[key] = converted
    return intervals