import re
import pandas as pd
from bioframes import to_np_int, sanger_qual_str_to_error
from itertools import groupby
from func import (
    pmap, psplit, pstrip, compose, compose_all, merge_dicts, fzip,
    partial2, dictmap, starcompose,
)
from operator import itemgetter
from functools import partial
import operator as op
from schema import Schema, Use
# NOTE(review): `ifilter` only exists in Python 2's itertools; this import
# pins the module to Python 2 as written.
from itertools import ifilter

# ---------------------------------------------------------------------------
# Parse options
# ---------------------------------------------------------------------------

# Composition of pstrip('[]'), psplit(',') and to_np_int.
# Presumably turns text such as '[1,2,3]' into a numpy integer array; the
# order of application depends on how `compose_all` chains its arguments
# (TODO confirm against `func`).
parse_array = compose_all(to_np_int, psplit(','), pstrip('[]'))

# Presumably splits a record on tab characters (see `func.psplit`).
tabsplit = psplit('\t')

# Column name -> Python type for the alignment-record fields listed below.
# The commented-out names MRNM / MPOS / ISIZE from the original are kept for
# reference; they sat next to RNEXT / PNEXT / TLEN in the original source.
#
# NOTE(review): in the original text this literal was never closed (the next
# statement was an `import`), which is a syntax error.  It is closed here
# right after the last visible entry.  No `QUAL` entry is visible -- confirm
# whether one was lost.
basic_scheme = {
    'QNAME': str,
    'FLAG': int,
    'RNAME': str,
    'POS': int,
    'MAPQ': int,
    'CIGAR': str,
    'RNEXT': str,
    'PNEXT': int,
    'TLEN': int,
    # 'MRNM': str,
    # 'MRNM': '*='.__contains__,
    # 'MPOS': int,
    # 'ISIZE': int,
    'SEQ': str,
}
from pyparsing import Regex

# ---------------------------------------------------------------------------
# Grammar for one optional field written as TAG:TYPE:VALUE, e.g. 'AS:i:213'.
# ---------------------------------------------------------------------------

# Two-character tag: a letter followed by a letter or digit.
tag = Regex(r'[A-Za-z][A-Za-z0-9]')

# Single-character type code.
_type = Regex(r'[AifZHB]')

# Everything up to the next tab.  The original pattern '[^\t]' had no
# quantifier, so it matched exactly one character and 'AS:i:213' produced the
# value '2' instead of '213'.
value = Regex(r'[^\t]+')

# pyparsing turns the bare ':' strings into literal matchers when added.
full = tag + ':' + _type + ':' + value

# For reference (not used yet):
# cigar_regex = r'\*|([0-9]+[MIDNSHPX=])+'
# `?` makes the regex not be too greedy

# Smoke check carried over from the original; the result is discarded.
full.parseString('AS:i:213')

# Same composition that is bound to `parse_array` earlier in the file.
# Presumably strips '[]', splits on ',' and converts to a numpy integer
# array (TODO confirm against `func.compose_all` ordering).
parse_array = compose_all(to_np_int, psplit(','), pstrip('[]'))

# TODO: convert QUAL characters to integers (the original hinted at `ord`).
# NOTE: samfiles use ASCII of Phred-scaled base QUALity+33.

# Type code -> converter for the VALUE part of a field.
# As in the original, this mapping is a bare expression and is not bound to a
# name, so nothing can use it yet.
# NOTE(review): `chr` expects an integer, so it will raise on a one-character
# string -- 'A' probably wants `str`; confirm.
# NOTE(review): `int` parses base 10, but the original comment says "hex" --
# 'H' probably wants `partial(int, base=16)`; confirm.
{
    'A': chr,
    'i': int,
    'f': float,
    'Z': str,
    'H': int,  # hex
    'B': parse_array,
}

# parse cigar string