def __init__(self, filename, region=None):
    """Open a VCF source and consume its header.

    :param filename: path of the file to read; ``'-'`` reads from stdin.
        Names ending in ``.gz`` / ``.bz2`` are opened with ``gzip`` /
        ``bz2``. A ``.gz`` file with a ``.tbi`` file next to it is read
        through the external ``tabix`` program when a region is given.
    :param region: optional region string, either ``seq`` or
        ``seq:start-end`` (start and end are parsed with ``int``).
    :raises StopIteration: if the input ends before a header or data line
        is seen.
    """
    self.filename = filename
    self.region = False
    self.region_str = region
    self.region_has_pos = False
    self.been_in_region = False
    self.lg = lg.getLogger('PSVCF')

    if region:
        self.region = True
        region_parts = self.region_str.split(':')
        self.region_seq = region_parts[0]
        if len(region_parts) > 1:
            self.region_has_pos = True
            bounds = region_parts[1].split('-')
            self.region_start = int(bounds[0])
            self.region_end = int(bounds[1])

    self.file_mode = 'file'
    if filename == '-':
        self.file_mode = 'stdin'
        self.F = Peekorator(sys.stdin)
    elif filename.endswith('.gz') and os.path.exists(filename + '.tbi') \
            and self.region:
        self.lg.debug('tabix mode')
        self.file_mode = 'tabix'
        # Pass the arguments as a list so that paths containing whitespace
        # are not split into several arguments.
        cmd = ['tabix', '-h', filename, self.region_str]
        self.lg.debug("running tabix: %s" % ' '.join(cmd))
        self.TABIX_PROCESS = sp.Popen(cmd, stdout=sp.PIPE)
        self.F = Peekorator(self.TABIX_PROCESS.stdout)
    elif filename.endswith('.gz'):
        self.lg.debug('Opening as gzip')
        self.F = Peekorator(gzip.open(self.filename))
    elif filename.endswith('.bz2'):
        self.lg.debug('Opening as bz2')
        self.F = Peekorator(bz2.BZ2File(self.filename))
    else:
        self.lg.debug('normal file mode')
        self.F = Peekorator(open(self.filename))

    self.meta_header_lines = []
    self.header_line = ""

    # Read the header. Lines are inspected through ``peek`` and only
    # consumed when they belong to the header, so the first data line is
    # left in place for the iterator.
    while True:
        line = self.F.peek
        if not line:
            raise StopIteration
        elif not line.strip():
            # blank line - consume and ignore
            self.F.next()
        elif line[:2] == '##':
            self.F.next()
            self.meta_header_lines.append(line.strip())
        elif line[:6] == '#CHROM':
            # column header; left unconsumed, as in the original code
            self.interpret_header(line)
            break
        elif line[0] != '#':
            # first data line reached without a column header
            self.create_dummy_header()
            break
        else:
            # A '#' line that is neither '##meta' nor '#CHROM'. It must be
            # consumed, otherwise peek returns it forever and the loop
            # never terminates.
            self.lg.debug('skipping unrecognized header line')
            self.F.next()

    self.lg.info('Opening %s' % self.filename)
class PSVCF(object):
    """Iterator over the loci of a VCF file, optionally limited to a region.

    The constructor opens the input (plain, gzip, bz2, stdin or via the
    external ``tabix`` program) and reads the header. Iterating the object
    then yields one ``Locus`` per data line.
    """

    def __init__(self, filename, region=None):
        """Open a VCF source and consume its header.

        :param filename: path of the file to read; ``'-'`` reads from
            stdin. Names ending in ``.gz`` / ``.bz2`` are opened with
            ``gzip`` / ``bz2``. A ``.gz`` file with a ``.tbi`` file next to
            it is read through ``tabix`` when a region is given.
        :param region: optional region string, either ``seq`` or
            ``seq:start-end`` (start and end are parsed with ``int``).
        :raises StopIteration: if the input ends before a header or data
            line is seen.
        """
        self.filename = filename
        self.region = False
        self.region_str = region
        self.region_has_pos = False
        self.been_in_region = False
        self.lg = lg.getLogger('PSVCF')

        if region:
            self.region = True
            region_parts = self.region_str.split(':')
            self.region_seq = region_parts[0]
            if len(region_parts) > 1:
                self.region_has_pos = True
                bounds = region_parts[1].split('-')
                self.region_start = int(bounds[0])
                self.region_end = int(bounds[1])

        self.file_mode = 'file'
        if filename == '-':
            self.file_mode = 'stdin'
            self.F = Peekorator(sys.stdin)
        elif filename.endswith('.gz') \
                and os.path.exists(filename + '.tbi') \
                and self.region:
            self.lg.debug('tabix mode')
            self.file_mode = 'tabix'
            # Pass the arguments as a list so that paths containing
            # whitespace are not split into several arguments.
            cmd = ['tabix', '-h', filename, self.region_str]
            self.lg.debug("running tabix: %s" % ' '.join(cmd))
            self.TABIX_PROCESS = sp.Popen(cmd, stdout=sp.PIPE)
            self.F = Peekorator(self.TABIX_PROCESS.stdout)
        elif filename.endswith('.gz'):
            self.lg.debug('Opening as gzip')
            self.F = Peekorator(gzip.open(self.filename))
        elif filename.endswith('.bz2'):
            self.lg.debug('Opening as bz2')
            self.F = Peekorator(bz2.BZ2File(self.filename))
        else:
            self.lg.debug('normal file mode')
            self.F = Peekorator(open(self.filename))

        self.meta_header_lines = []
        self.header_line = ""

        # Read the header. Lines are inspected through ``peek`` and only
        # consumed when they belong to the header, so the first data line
        # is left in place for next().
        while True:
            line = self.F.peek
            if not line:
                raise StopIteration
            elif not line.strip():
                # blank line - consume and ignore
                self.F.next()
            elif line[:2] == '##':
                self.F.next()
                self.meta_header_lines.append(line.strip())
            elif line[:6] == '#CHROM':
                # column header; left unconsumed, next() skips it again
                self.interpret_header(line)
                break
            elif line[0] != '#':
                # first data line reached without a column header
                self.create_dummy_header()
                break
            else:
                # A '#' line that is neither '##meta' nor '#CHROM'. It must
                # be consumed, otherwise peek returns it forever and the
                # loop never terminates.
                self.lg.debug('skipping unrecognized header line')
                self.F.next()

        self.lg.info('Opening %s' % self.filename)

    def __iter__(self):
        """Return self; the object is its own iterator."""
        return self

    def create_dummy_header(self):
        """Install a default column header with one sample named 'sample'."""
        self.header_line = "\t".join(
            ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
             'INFO', 'FORMAT', 'sample'])
        ls = self.header_line.split("\t")
        self.sample_names = ls[9:]

    def interpret_header(self, line):
        """Store the ``#CHROM`` column header and extract the sample names.

        :param line: the raw header line; surrounding whitespace is
            stripped and the line is split on tabs.
        :raises AssertionError: if the first nine columns are not the
            fixed VCF columns.
        """
        self.header_line = line.strip()
        ls = self.header_line.split("\t")
        assert(ls[:9] == ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL',
                          'FILTER', 'INFO', 'FORMAT'])
        # every column after FORMAT is a sample
        self.sample_names = ls[9:]

    def simple_names(self):
        """
        Return simplified sample names

        Each name is reduced to its basename with every occurrence of
        '.vcf', '.bam' and '.bz2' removed.
        """
        def _simple_name(s):
            s = os.path.basename(s)
            for suffix in ('.vcf', '.bam', '.bz2'):
                s = s.replace(suffix, '')
            return s

        return [_simple_name(x) for x in self.sample_names]

    def next(self):
        """Return the next ``Locus``, honouring the region if one was given.

        Blank lines are skipped; '##' meta lines and '#CHROM' header lines
        met after the constructor finished are absorbed into the header
        state instead of being returned.

        :raises StopIteration: at end of input, or once the region has
            been passed.
        :raises SampleParseError: when a data line cannot be parsed.
        """
        while True:
            line = self.F.next()
            if not line:
                # EOF
                raise StopIteration

            line = line.strip()
            if not line:
                # empty line
                continue

            if line[:2] == '##':
                self.meta_header_lines.append(line)
                continue

            if line[:6] == '#CHROM':
                self.interpret_header(line)
                continue

            try:
                loc = Locus(line, sample_names=self.sample_names)
            except SampleParseError:
                self.lg.critical("could not parse line")
                self.lg.critical(line)
                # bare raise keeps the original exception and its detail
                raise

            if not self.region:
                # No region is specified - return regardless
                return loc

            # figure out if this locus falls within the specified region
            if loc.seq != self.region_seq:
                if self.been_in_region:
                    # the region is behind us - fin() raises StopIteration
                    self.fin()
                # Not in the region's sequence yet: skip this locus instead
                # of falling through and returning it as if it matched.
                continue

            if not self.region_has_pos:
                self.been_in_region = True
                return loc

            if loc.pos < self.region_start:
                continue
            if loc.pos > self.region_end:
                raise StopIteration

            self.been_in_region = True
            return loc

    # Python 3 iterator protocol; ``next`` stays available for Python 2.
    __next__ = next

    def fin(self):
        """
        Finish iterations

        Closes the underlying input (stdin is left open) and always raises
        StopIteration so the calling iteration terminates.
        """
        if self.file_mode == 'file':
            self.F.close()
        elif self.file_mode == 'stdin':
            pass
        elif self.file_mode == 'tabix':
            self.F.close()
        raise StopIteration

    def add_meta(self, k, v):
        """
        Set a meta key/value pair - will be added to the header
        """
        self.meta_header_lines.append("##%s=%s" % (k, v))

    def build_header(self):
        """
        Return a new header

        The result is all meta lines followed by the column header line,
        newline-terminated. A copy of the meta list is used so the column
        header is not appended to ``meta_header_lines`` on every call.
        """
        rv = list(self.meta_header_lines)
        rv.append(self.header_line)
        return "\n".join(rv) + "\n"