def __init__(self, f, fname, default_sample_id):
    """
    Consume the leading "#" lines and the tab-separated column header.

    Every leading line that starts with "#" is read and skipped; when one of
    them starts with "##fileformat=" its value is remembered. The first line
    that does not start with "#" is taken as the header: its column names
    are lower-cased and the position of each name found in _COLUMNS is
    recorded for use when reading data lines.

    :param f: handed unchanged to TextParser.__init__
    :param fname: handed unchanged to TextParser.__init__
    :param default_sample_id: handed unchanged to TextParser.__init__
    :raises ParserException: if the input ends before a header line is found
        or if the header lacks one of the names listed in _COLUMNS
    """
    TextParser.__init__(self, f, fname, default_sample_id)

    self.__format = None

    # Metadata and comments
    line = self._readline()
    while len(line) > 0 and line.startswith("#"):
        if line.startswith("##fileformat="):
            # Keep only the value: the line still carries its terminator here
            self.__format = line[len("##fileformat="):].rstrip("\r\n")
        line = self._readline()

    # An empty string from _readline() is treated as end of input
    if len(line) == 0:
        # NOTE(review): (self._fname) is a plain value, not a 1-tuple, while the
        # other raise below passes self._location() -- confirm which form
        # ParserException expects before changing it.
        raise ParserException("Header not found", (self._fname))

    # Header
    columns = line.rstrip().split("\t")
    self._col_size = len(columns)

    column_indices = {}
    for i, name in enumerate(columns):
        name = name.lower()
        if name in _COLUMNS:
            column_indices[name] = i

    self._col_name_indices = column_indices

    # Only the lookup can raise KeyError (a name from _COLUMNS missing in the header)
    try:
        self._col_indices = [column_indices[name] for name in _COLUMNS]
    except KeyError as ex:
        raise ParserException("Header column not found: {0}".format(ex.args[0]), self._location())
def __init__(self, f, fname, default_sample_id):
    """
    Consume the leading "#" lines and keep the first non-comment line.

    Leading lines starting with "#" are read and skipped. Two of them carry
    information that is stored:

    * "##INDIVIDUAL=<value>" replaces the default sample id
    * "##fileformat=<value>" is remembered as the file format

    The first line that does not start with "#" (if any) is passed to
    _queue_line() instead of being dropped.

    :param f: handed unchanged to TextParser.__init__
    :param fname: handed unchanged to TextParser.__init__
    :param default_sample_id: handed unchanged to TextParser.__init__; may be
        overridden by a "##INDIVIDUAL=" line
    """
    TextParser.__init__(self, f, fname, default_sample_id)

    self.__format = None

    # Metadata and comments
    line = self._readline()
    while len(line) > 0 and line.startswith("#"):
        if line.startswith("##INDIVIDUAL="):
            # Strip the line terminator, otherwise every sample name derived
            # from this id would end with a newline
            self._default_sample_id = line[len("##INDIVIDUAL="):].rstrip("\r\n")
        elif line.startswith("##fileformat="):
            self.__format = line[len("##fileformat="):].rstrip("\r\n")
        line = self._readline()

    if len(line) > 0:
        # First line: already consumed from the input, so queue it
        # (presumably so that next() reads it again -- see _queue_line)
        self._queue_line(line)
def next(self):
    """
    Return the next variant read from the tab-separated input.

    Lines are read until one yields a variant. A line is passed to
    _discard_line() and skipped when its chromosome is missing or rejected by
    parse_chromosome(), its start is missing or not an integer, its strand is
    not one of "", "1", "+1", "-1", "+", "-", or any of ref / allele 1 /
    allele 2 is missing or does not match _ALLELE_RE.

    When the two alleles differ, the line is queued again with allele 1
    replaced by allele 2 so that the second allele is handled by a later
    iteration. A line whose (normalised) ref equals its alt produces no
    variant.

    :return: a Variant with a single Sample
    :raises StopIteration: when _readline() returns an empty string
    """
    TextParser.next(self)

    var = None
    while var is None:
        line = self._readline()
        if len(line) == 0:
            raise StopIteration()

        fields = line.rstrip("\n").split("\t")
        num_fields = len(fields)

        # self._col_indices holds the field position of each expected column.
        # A data line can be shorter than the header, so compare against the
        # number of fields actually present; absent fields become None.
        # The type column is read but not used: the type is derived from the
        # allele lengths further down.
        chrom, start, strand, _type_col, ref, alt1, alt2, sample = [
            fields[i] if i < num_fields else None for i in self._col_indices]

        # Chromosome
        if chrom is None:
            self._discard_line()
            continue
        chrom = parse_chromosome(chrom)
        if chrom is None:
            self._discard_line()
            continue

        # Start (TypeError covers a missing field, ValueError a non-integer one)
        try:
            start = int(start)
        except (TypeError, ValueError):
            self._discard_line()
            continue

        # Strand: a missing strand is treated like an empty one
        if not strand or strand in ("1", "+1"):
            strand = "+"
        elif strand == "-1":
            strand = "-"
        elif strand not in ("+", "-"):
            self._discard_line()
            continue

        # Ref & alt
        if ref is None or alt1 is None or alt2 is None:
            self._discard_line()
            continue

        if any(_ALLELE_RE.match(a) is None for a in (ref, alt1, alt2)):
            self._discard_line()
            continue

        alt = alt1

        if ref == "-":
            # Empty reference: both alleles are prefixed with an "N" anchor,
            # the start is left untouched
            #   ref "-"  alt "T"  -->  ref "N"  alt "NT"
            ref = "N"
            alt = ("N" + alt) if alt != "-" else "N"
        elif alt == "-":
            # Empty alternative: the start moves one position back and both
            # alleles are prefixed with an "N" anchor
            #   ref "T"  alt "-"  -->  ref "NT"  alt "N"
            start -= 1
            ref = "N" + ref
            alt = "N"

        vtype = Variant.SUBST if len(ref) == len(alt) else Variant.INDEL

        # A missing or empty sample falls back to the default sample id
        if not sample:
            sample = self._default_sample_id

        if alt1 != alt2:
            # Queue the same line with allele 1 := allele 2; on that later pass
            # both alleles are equal, so the line is not queued a third time
            col = self._col_name_indices
            fields[col[_COL_ALLELE1]] = fields[col[_COL_ALLELE2]]
            self._queue_line("\t".join(fields))

        # Nothing changed for this allele: no variant to report
        if ref == alt:
            continue

        var = Variant(type=vtype, chr=chrom, start=start, ref=ref, alt=alt, strand=strand,
                      samples=[Sample(name=sample)])

    return var
def next(self):
    """
    Return the next variant read from the input.

    Each record must provide at least five fields: chromosome, start, end,
    strand and an allele string of the form "<ref>><alt>"; a sixth field, the
    sample, is optional and defaults to the default sample id. Records are
    read until one yields a variant.

    A record is passed to _discard_line() and skipped when it has fewer than
    five fields, its chromosome is rejected by parse_chromosome(), start or
    end is not an integer, the strand is not one of "", "1", "+1", "-1", "+",
    "-", the allele string does not split into exactly two parts on ">", or
    either part does not match _ALLELE_RE.

    Allele strings containing "/" (for example "C/C>A/T") are not returned
    directly: they are split into one "<ref>><alt>" record per allele pair,
    each of which is queued with _queue_line() for a later iteration.

    :return: a Variant with a single Sample
    """
    TextParser.next(self)

    var = None
    while var is None:
        fields = self.__read_fields()

        if len(fields) < 5:
            self._discard_line()
            continue

        # The sample column is optional
        if len(fields) < 6:
            fields += [self._default_sample_id]

        chrom, start, end, strand, allele, sample = fields[0:6]

        # Chromosome
        chrom = parse_chromosome(chrom)
        if chrom is None:
            self._discard_line()
            continue

        # Start and end: only conversion failures reject the record
        try:
            start = int(start)
            end = int(end)
        except (TypeError, ValueError):
            self._discard_line()
            continue

        if start > end:
            start, end = end, start

        # Strand
        if len(strand) == 0 or strand in ("1", "+1"):
            strand = "+"
        elif strand == "-1":
            strand = "-"
        elif strand not in ("+", "-"):
            self._discard_line()
            continue

        # Alleles
        alleles = allele.split(">")
        if len(alleles) != 2:
            self._discard_line()
            continue

        ref, alt = alleles

        # Check that both are well formed
        if any(_ALLELE_RE.match(a) is None for a in (ref, alt)):
            self._discard_line()
            continue

        # Special cases
        if ref == "-":
            # ->A : empty reference, both alleles get an "N" anchor
            #   ref "-"  alt "T"  -->  ref "N"  alt "NT"
            ref = "N"
            alt = ("N" + alt) if alt != "-" else "N"
        elif alt == "-":
            # GCT>- : empty alternative, start moves one position back and
            # both alleles get an "N" anchor
            #   ref "T"  alt "-"  -->  ref "NT"  alt "N"
            start -= 1
            ref = "N" + ref
            alt = "N"
        elif ref == "*" and len(alt) > 1 and alt[0] in ("-", "+"):
            # *>-ACG, *>+CG : the sign selects which allele receives the bases
            if alt[0] == "-":
                start -= 1
                ref = "N" + alt[1:]
                alt = "N"
            else:
                ref = "N"
                alt = "N" + alt[1:]
        elif "/" in ref or "/" in alt:
            # A/A>-/GGT, C/C>A/T, C/C>G/G
            ref_parts = ref.split("/")
            alt_parts = alt.split("/")
            if len(ref_parts) != 2 or len(ref_parts) != len(alt_parts):
                self._discard_line()
                continue

            # Both pairs identical: one record is enough
            if ref_parts[0] == ref_parts[1] and alt_parts[0] == alt_parts[1]:
                ref_parts.pop()
                alt_parts.pop()

            # Queue one plain "<ref>><alt>" record per pair, built from the
            # already normalised chromosome, coordinates and strand
            for single_ref, single_alt in zip(ref_parts, alt_parts):
                allele = "{0}>{1}".format(single_ref, single_alt)
                self._queue_line("\t".join([chrom, str(start), str(end), strand, allele, sample]))

            continue

        vtype = Variant.SUBST if len(ref) == len(alt) else Variant.INDEL

        # Sample
        if len(sample) == 0:
            sample = self._default_sample_id

        var = Variant(type=vtype, chr=chrom, start=start, ref=ref, alt=alt, strand=strand,
                      samples=[Sample(name=sample)])

    return var
def __init__(self, f, fname, default_sample_id):
    """
    Create the parser.

    All three arguments are handed unchanged to TextParser.__init__; this
    constructor performs no work of its own.
    """
    TextParser.__init__(
        self,
        f,
        fname,
        default_sample_id)
def next(self):
    """
    Return the next variant read from the tab-separated input.

    Lines whose first non-blank character is "#" are skipped wherever they
    occur; a line starting with "##INDIVIDUAL=" additionally replaces the
    default sample id. For data lines the first five fields are used:
    chromosome, start, an identifier (ignored), ref and alt.

    A data line is passed to _discard_line() and skipped when it has fewer
    than five fields, its chromosome is rejected by parse_chromosome(), its
    start is not an integer, ref does not match _REF_RE or alt does not match
    _ALT_RE.

    The variant type is derived from the allele lengths: an alternative
    allele as long as ref counts as a substitution, any other length as an
    indel. When alt lists several comma-separated alleles that do not all
    fall in the same category the type is Variant.COMPLEX. The alt string is
    stored as read, commas included.

    :return: a Variant on the "+" strand whose single Sample carries the
        default sample id
    :raises StopIteration: when _readline() returns an empty string
    """
    TextParser.next(self)

    var = None
    while var is None:
        line = self._readline()
        while len(line) > 0 and line.lstrip().startswith("#"):
            if line.startswith("##INDIVIDUAL="):
                # Strip the line terminator, otherwise every sample name
                # created below would end with a newline
                self._default_sample_id = line[len("##INDIVIDUAL="):].rstrip("\r\n")
            line = self._readline()

        if len(line) == 0:
            raise StopIteration()

        fields = line.rstrip("\n").split("\t")

        if len(fields) < 5:
            self._discard_line()
            continue

        chrom, start, _external_id, ref, alt = fields[0:5]

        # Chromosome
        chrom = parse_chromosome(chrom)
        if chrom is None:
            self._discard_line()
            continue

        # Start: only a failed conversion rejects the line
        try:
            start = int(start)
        except (TypeError, ValueError):
            self._discard_line()
            continue

        # Check ref and alt
        if _REF_RE.match(ref) is None or _ALT_RE.match(alt) is None:
            self._discard_line()
            continue

        # One category per alternative allele; a single allele (no comma)
        # yields a one-element list
        ref_len = len(ref)
        allele_types = [
            Variant.SUBST if len(allele) == ref_len else Variant.INDEL
            for allele in alt.split(",")]

        vtype = allele_types[0]
        if any(t != vtype for t in allele_types[1:]):
            vtype = Variant.COMPLEX

        var = Variant(type=vtype, chr=chrom, start=start, ref=ref, alt=alt, strand="+",
                      samples=[Sample(name=self._default_sample_id)])

    return var