def test1(n):
    from numpy import array

    # One equilibrium of Ar4 LJ cluster (in coordinates of
    # c2v_tetrahedron1 Func):
    w = 0.39685026
    A = array([w, w, +w])

    # Another equilibrium:
    B = array([w, w, -w])

    # Halfway between A and B:
    C = (A + B) / 2.0
    C = array([w + 0.01, w - 0.01, 0.0])

    xs = array([A, C, B])

    from test.testfuns import c2v_tetrahedron1, diagsandhight
    from path import MetricPath
    from metric import Metric
    from numpy import linspace

    z = c2v_tetrahedron1()

    # z = diagsandhight()
    # r = 1.12246195815
    # A = array([r, r, r / sqrt(2.)])
    # B = array([r, r, -r / sqrt(2.)])
    # C = array([r, r * sqrt(2.), 0.])
    # xs = array([A, C, B])

    p = MetricPath(xs, Metric(z).norm_up)

    x0 = map(p, linspace(0., 1., n))

    from ase import Atoms
    from qfunc import QFunc
    from func import compose

    pes = compose(QFunc(Atoms("Ar4")), z)

    from rc import Volume

    vol = compose(Volume(), z)

    def callback(x, e, g, t):
        # from pts.tools.jmol import jmol_view_path
        print "energies=", e  # map(pes, x)
        print "volume=", map(vol, x)
        # jmol_view_path(map(z, x), syms=["Ar"] * 4, refine=1)
        pass

    print "BEFORE:"
    callback(x0, map(pes, x0), map(pes.fprime, x0), None)

    x1, info = soptimize(pes, x0, tangent1, rc=vol, callback=callback)
    # print "info=", info

    print "AFTER:"
    callback(x1, map(pes, x1), map(pes.fprime, x1), None)
def fqframe(fileh):
    final_schema = Schema({
        'id': str,
        'seq': str,
        'quality': str,
        'qual_ints': check_np_type('int64'),
        'error': check_np_type('float64'),
        'description': str
    })

    #get_object = _id
    index = ['id']
    columns = ('id', 'seq', 'quality', 'description', 'qual_ints', 'error')
    SANGER = True
    get_id = attr('id')
    get_seq = compose(str, attr('seq'))
    get_qual_ints = compose_all(np.array, itemgetter('phred_quality'),
                                attr('_per_letter_annotations'))
    get_description = attr('description')
    get_quality = SeqIO.QualityIO._get_sanger_quality_str
    get_error = compose(error, get_qual_ints)
    #get_error = error_from_ints(get_qual_ints)
    getters = [get_id, get_seq, get_quality, get_description,
               get_qual_ints, get_error]
    assert len(getters) == len(columns)
    metadata = {'filename': fileh.name}
    iterator = get_fastq(fileh)
    get_raw_record = partial(next, iterator)

#    def get_row(record):
#        #record = next(fileh)
##        import sys
##        __module__ = sys.modules[__name__]
##        get_getter = compose(attr, "get_{0}".format)
##        _getters = map(get_getter, columns)
##        self_getters = apply_each(_getters, __module__)  #fzip(_getters, repeat(__module__, clen))
#        results = apply_each(self_getters, record)
#        final_dict = dict(zip(columns, results))
#        final_schema.validate(final_dict)
#        return final_dict

#    def load_fastq():
#        fq = get_fastq(fileh)
#        dicts = map(get_row, fq)
#        return pd.DataFrame(dicts).set_index(index)  #, index=index, columns=columns)

    #return nameddict(
    return {
        'obj_func': get_raw_record,
        'columns': columns,
        'getters': getters,
        'validator': final_schema,
        'dictgetters': None
    }
def parse_cigar(cigar_str):
    # the trailing `+?` keeps the regex non-greedy so each
    # (count, operation) pair is captured separately
    cigar_regex = r'(?:([0-9]+)([MIDNSHPX=]))+?'
    reg = re.compile(cigar_regex)
    tups = reg.findall(cigar_str)
    key, value = itemgetter(1), itemgetter(0)
    groups = groupby(sorted(tups, key=key), key)
    get_counts = pmap(compose(int, itemgetter(0)))
    sum_counts = compose(sum, get_counts)
    s = "cigar_{0}".format
    cigar_dict = dict((s(name), sum_counts(nums)) for name, nums in groups)
    #print cigar_dict
    mismatches = sum(num for k, num in cigar_dict.items()
                     if k not in ['cigar_M', 'cigar_='])
    return merge_dicts(cigar_dict, {'cigar_score': mismatches})
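# Hedged usage sketch (assumes re/groupby/itemgetter and the local
# pmap/compose/merge_dicts helpers are in scope, as elsewhere in this module):
# the two soft-clips (15 + 23) count toward the score, the 213 aligned M
# bases do not.
print parse_cigar('15S213M23S')
# => {'cigar_S': 38, 'cigar_M': 213, 'cigar_score': 38}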
def fqframe(fileh):
    final_schema = Schema({
        'id': str,
        'seq': str,
        'quality': str,
        'qual_ints': check_np_type('int64'),
        'error': check_np_type('float64'),
        'description': str
    })

    #get_object = _id
    index = ['id']
    columns = ('id', 'seq', 'quality', 'description', 'qual_ints', 'error')
    SANGER = True
    get_id = attr('id')
    get_seq = compose(str, attr('seq'))
    get_qual_ints = compose_all(np.array, itemgetter('phred_quality'),
                                attr('_per_letter_annotations'))
    get_description = attr('description')
    get_quality = SeqIO.QualityIO._get_sanger_quality_str
    get_error = compose(error, get_qual_ints)
    #get_error = error_from_ints(get_qual_ints)

    def get_row(record):
        #record = next(fileh)
        print(get_funcs())
        import sys
        __module__ = sys.modules[__name__]
        get_getter = compose(attr, "get_{0}".format)
        _getters = map(get_getter, columns)
        self_getters = apply_each(_getters, __module__)  #fzip(_getters, repeat(__module__, clen))
        results = apply_each(self_getters, record)
        final_dict = dict(zip(columns, results))
        final_schema.validate(final_dict)
        return final_dict

    def load_fastq():
        fq = get_fastq(fileh)
        dicts = map(get_row, fq)
        return pd.DataFrame(dicts).set_index(index)  #, index=index, columns=columns)

    return namedtuple('FastqFrame', ['get_row', 'load_fastq'])(get_row, load_fastq)
    #{'get_row' : get_row, 'load_fastq' : load_fastq}
from operator import itemgetter
from functools import partial
import operator as op
from operator import add, div
from schema import Schema, Use
from itertools import ifilter
# Parse options
#from pyparsing import Regex

to_np_int = partial(np.array, dtype=int)
parse_array = compose_all(to_np_int, psplit(','), pstrip('[]'))
tabsplit = psplit('\t')
minus33 = partial(add, -33)
qual_int_sanger = compose(minus33, ord)
qual_to_phreds = compose(to_np_int, pmap(qual_int_sanger))
error = compose(partial(pow, 10), partial2(div, -10.0))
#don't need to map because numpy vectorizes it automatically
#TODO: handle non-sanger version
sanger_qual_str_to_error = compose(error, qual_to_phreds)

basic_scheme = {
    'QNAME': str,
    'FLAG': int,
    'RNAME': str,
    'POS': int,
    'MAPQ': int,
    'CIGAR': str,
    'RNEXT': str,
    'PNEXT': int,
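# Worked example of the Phred math above (a sketch; assumes the Sanger +33
# encoding and the local compose/pmap/partial2 helpers): 'I' has ord 73, so
# its Phred score is 40 and Error = 10 ** (-40 / 10.0) = 1e-4, while '!'
# (ord 33) is Phred 0, i.e. Error = 1.
print qual_int_sanger('I')             # 40
print sanger_qual_str_to_error('II!')  # ~ array([ 1e-04, 1e-04, 1. ])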
from operator import methodcaller as mc, ne
import numpy as np
import networkx as nx
from matplotlib import pyplot as plt

def to_adj_list(G, edgekey='weight', bothways=True, as_float=False):
    edges = sorted(ifilterfalse(lambda x: x[0] == x[1], G.edges(data=True)))
    if as_float:
        res = map(lambda x: (x[0], x[1], float(x[-1]['weight'])), edges)
        form = "{0}->{1}:{2:.3f}\n{1}->{0}:{2:.3f}".format
    else:
        res = map(lambda x: (x[0], x[1], int(x[-1]['weight'])), edges)
        form = "{0}->{1}:{2}".format if not bothways else "{0}->{1}:{2}\n{1}->{0}:{2}".format
    return starmap(form, res)

adj_str = compose('\n'.join, to_adj_list)

def parseM(raw):
    '''Parse & return a space-separated matrix.'''
    _in = filter(bool, raw.split('\n'))
    return np.matrix(map(pmap(float), map(str.split, _in)))

def quantify(iterable, pred=bool):
    '''https://docs.python.org/2/library/itertools.html#recipes
    "Count how many times the predicate is true"'''
    return sum(imap(pred, iterable))

def drawgraph(G, edgekey='weight', big=False, **kwargs):
    if big:
        fig = plt.figure(figsize=(15, 10))
    pos = nx.spring_layout(G)
    nx.draw_networkx(G, pos=pos, **kwargs)
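# Hedged usage sketch for parseM/quantify (assumes imap and the local
# pmap helper are imported at module top, as the functions above expect):
M = parseM("""
1 2
3 4
""")
print M                        # matrix([[ 1.,  2.], [ 3.,  4.]])
print quantify([0, 1, 2, 0])   # 2 truthy values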
from func import compose, partial2
import numpy as np
from functools import partial
import operator as op
from itertools import islice, ifilter
from fn.iters import splitby
from operator import methodcaller
from func import compose_all, pmap, psplit, _not

norm_matrix = partial(np.linalg.norm, axis=1)
dist_matrix = compose(norm_matrix, op.sub)
min_dist_position = compose(np.argmin, dist_matrix)

def gravity(M):
    return M.sum(axis=0) / float(len(M))

def k_means_cluster(data, centers=None, k=None):
    assert (centers is not None or k is not None)
    if centers is None:
        centers = np.empty((k, data.shape[1]))
        centers[:] = data[:k]
    mincenters = partial(min_dist_position, centers)
    # iterate the assign/update steps until the centers stop moving
    # (a single unconverged pass would otherwise fall through and return None)
    while True:
        old_centers = centers.copy()
        data_by_cluster = np.apply_along_axis(mincenters, 1, data)
        for i, _ in enumerate(centers):
            cluster = data[data_by_cluster == i]
            centers[i] = gravity(cluster)
        if np.allclose(old_centers, centers):
            return centers, data_by_cluster
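# Minimal usage sketch on two well-separated blobs (assumes the func
# helpers above are importable); centers seed from the first k rows:
data = np.array([[0., 0.], [0., 1.], [10., 10.], [10., 11.]])
centers, labels = k_means_cluster(data, k=2)
print centers   # ~ [[ 0.,  0.5], [10., 10.5]]
print labels    # array([0, 0, 1, 1])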
def col_compare(df, col, value, comp):
    half = partial(comp, value)
    boolean = compose(half, df.__getitem__)
    return boolean(col)
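# Usage sketch on a hypothetical frame (assumes partial/compose as imported
# elsewhere in this module). Note the argument order: the comparison runs as
# comp(value, cell), so op.gt keeps rows where value > df[col]:
import pandas as pd
import operator as op
df = pd.DataFrame({'MAPQ': [10, 30, 50]})
print col_compare(df, 'MAPQ', 30, op.gt)   # True, False, False  (30 > cell)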
'''
Join fastq and SAM (merge on QNAME [and SEQ])
Join VCF and SAM (merge on POS)
Pileup
Join VCF and Pileup
'''

'''
pcompose = partial(partial, compose)
error_from_ints = pcompose(error)
#sanger_qual_str_to_error = cmperror(qual_to_phreds)
'''

get_fastq = partial(SeqIO.parse, format='fastq')
get_fasta = partial(SeqIO.parse, format='fasta')
to_np_int = partial(np.array, dtype=int)
gccontent = compose(ilen, pifilter('GC'.__contains__))
minus33 = partial(add, -33)
qual_int_sanger = compose(minus33, ord)

''' Error = 10^-(Phred/10) '''
qual_to_phreds = compose(to_np_int, pmap(qual_int_sanger))
error = compose(partial(pow, 10), partial2(div, -10.0))
#don't need to map because numpy vectorizes it automatically
#TODO: handle non-sanger version
sanger_qual_str_to_error = compose(error, qual_to_phreds)
#SANGER_OFFSET = 33

''' assert len(quality) == len(error) == len(phred_scores) '''
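# Quick sketch of gccontent (assumes ilen counts an iterable and pifilter
# is a curried ifilter from the local func helpers): it counts the G and C
# characters of a sequence string.
print gccontent('GATTACA')   # 2 -- one G and one C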
0x1 : "template having multiple segments in sequencing", 0x2 : "each segment properly aligned according to the aligner", 0x4 : "segment unmapped", 0x8 : "next segment in the template unmapped", 0x10 : "SEQ being reverse complemented", 0x20 : "SEQ of the next segment in the template being reversed", 0x40 : "the rst segment in the template", 0x80 : "the last segment in the template", 0x100: "secondary alignment", 0x200: "not passing quality controls", 0x400: "PCR or optical duplicate", 0x800: "supplementary alignment" } eval_flag = compose(bool, op.and_) def flag_dict(flag): return dict((meaning, eval_flag(bit, flag)) for bit, meaning in flag_meanings.items()) def split_list(A, idx): return A[:idx], A[idx:] sam_columns = ("QNAME", "FLAG", "RNAME", "POS", "MAPQ", "CIGAR", "RNEXT", "PNEXT", "TLEN", "SEQ", "QUAL") #optiosn #TODO: get_record function takes a filehandle and returns a single record via SeqIO, etc. #So functions expect a dictionary I guess #pass parse_options = compose(dict, pmap(parse_option)) #, tabsplit) #readfields = compose(tabsplit, next) line_to_dict = compose_all(dict, partial(zip, sam_columns)) #, tabsplit)
    nx.draw_networkx(G, pos=pos, **kwargs)
    if edgekey:
        edge_labels = dict([((u, v,), d.get(edgekey, ''))
                            for u, v, d in G.edges(data=True)])
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)  #, **kwargs)
    plt.show()

#NOTE: requires graphviz
#nx.write_dot(G,'graph.dot')
#dot -Tpng graph.dot > graph.png

def info_fromkmer(kmer):
    node1, node2 = kmer[:-1], kmer[1:]
    return node1, node2, {'kmer': kmer}

yield_pathgraph = compose(F(imap, info_fromkmer), slider)
#pathlist = compose(list, yield_pathgraph)

#use reduce
def make_debruijn(s, k=None):
    G = nx.MultiDiGraph()
    if not k:
        G.add_edges_from(imap(info_fromkmer, s))
    else:
        #build straight from string
        G.add_edges_from(yield_pathgraph(s, k))
    return G

'''
set v to some random node.
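# Usage sketch (assumes slider(s, k) yields the k-mers of s, per utils):
# the 3-mers of 'AAGATT' are AAG, AGA, GAT, ATT, each contributing an
# edge between its (k-1)-length prefix and suffix; edge order may vary.
G = make_debruijn('AAGATT', 3)
print G.edges()   # [('AA', 'AG'), ('AG', 'GA'), ('GA', 'AT'), ('AT', 'TT')]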
def test(A, B, trafo=None):

    print "A=", A
    print "B=", B

    from pts.pes.mueller_brown import MB
    from pts.pes.mueller_brown import show_chain

    x = [A, B]

    # change coordinates:
    if trafo is not None:
        from func import compose
        MB = compose(MB, trafo)
        x = array(map(trafo.pinv, x))

    def show(x):
        if trafo is not None:
            show_chain(map(trafo, x))
        else:
            show_chain(x)

    from numpy import savetxt

    def callback(x, e, g, t):
        # savetxt("path.txt", x)
        # print "chain spacing=", spacing(x)
        pass

    from path import MetricPath  # for respacing
    from rc import Linear  # as reaction coordinate

    rcoord = Linear([1., -1.])

    from metric import Metric

    mt = Metric(rcoord)

    n = 3
    n_max = 30
    while True:
        #
        # Respace vertices based on custom metric built from the
        # definition of reaction coordinate:
        #
        p = MetricPath(x, mt.norm_up)
        x = array(map(p, linspace(0., 1., n)))

        print "BEFORE, rc(x)=", map(rcoord, x)
        show(x)

        # x = respace(x, tangent4, spacing)
        # print "RESPACE, x=", x
        # print "spacing(x)=", spacing(x)
        # show(x)

        # x, stats = soptimize(MB, x, tangent1, spacing, maxit=20, maxstep=0.1, callback=callback)
        # x, stats = soptimize(MB, x, tangent4, maxit=20, maxstep=0.1, callback=callback)
        x, stats = soptimize(MB, x, tangent4, rc=rcoord, maxit=20, maxstep=0.1, callback=callback)
        savetxt("mb-path.txt-" + str(len(x)), x)

        print "AFTER, rc(x)=", map(rcoord, x)
        show(x)

        if n < n_max:
            # double the number of beads:
            n = 2 * n + 1
        else:
            print "========================================================="
            print "Converged for the maximal tested number of beads: ", n
            print "========================================================="
            break
    'bats',
    'beagle-lib',
    'beast/beast',
    'beast/BEASTv1.8.0',
    'bio_pieces',
    'blast/blast-2.2.30+',
    'bowtie/bowtie-2.2.5',
    'bwa/bwa-0.7.12-r1044',
    'cuda/cuda',
    'cuda/cuda_6.5.14',
    'igv/igv-2.3.37',
    'igv/igv-2.3.52',
    'mrsnbactpipeline',
    'ngs_mapper/ngs_mapper-1.1',
    'ngs_mapper/ngs_mapper-1.2',
    'pathdiscov/pathdiscov-4.2',
    'pypbs',
    'usamriidPathDiscov',
    'vdbstatus',
    'ray/ray-2.3.1',
    'roche/analysis',
    'roche/analysis-v2.9',
    'roche/gsprocessor-v2.9',
    'samtools/samtools-1.1'
]

ngs_mapper_cmd = '''
cd $PBS_O_WORKDIR
mkdir -p $(pwd)/tmp
SAMPLEDIR=/media/VD_Research/NGSData/ReadsBySample/${SAMPLENAME}
TMPDIR=$(pwd)/tmp
runsample.py $SAMPLEDIR {REFPATH} {SAMPLENAME} -od {SAMPLENAME}
'''

expand_path = compose(os.path.realpath, os.path.expanduser)

''' Tab completion for directories '''
def glob_complete(text, state):
    expanded_text = expand_path(text)
    if os.path.isdir(expanded_text):
        expanded_text += '/'
    return (glob(expanded_text + '*') + [None])[state]

readline.set_completer_delims(' \t\n;')
readline.parse_and_bind("tab: complete")
readline.set_completer(glob_complete)

prompt = compose(raw_input, "{0}>".format)
def getvar(varname):
    return os.environ.get(varname, None) or prompt(varname)
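# Usage sketch: resolve a template variable such as REFPATH from the
# environment, falling back to the interactive "REFPATH>" prompt above:
refpath = getvar('REFPATH')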
    other_rows = range(0, j) + range(j + 1, D.shape[0])
    iks = get_products(other_rows)
    j_parent = partial(parent_len, D, j)
    return min(starmap(j_parent, iks))

from numpy import nan

def test_add_phylo_2D():
    _in = np.array([
        [0, 3, 5],
        [3, 0, nan],
        [5, nan, 0]
    ])
    expected = '''0->1:3
0->2:5
'''
    actual = additive_phylo(_in, 2)
    assert expected == actual

filterfst = compose(next, ifilter)

def str_row(D, j):
    row = D[j]
    p = (str(j) + "->{0}:{1}").format
    return '\n'.join(starmap(p, enumerate(row)))

def str_matrix(D):
    d_str = partial(str_row, D)
    return '\n'.join(map(d_str, xrange(D.shape[0])))

# method that gets node with matching distance
def get_match_dst(D, j, dist):
    assert dist != 0
    return (D[j] == dist).argmax()
    #return D[i, (D[i] == dist)]
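# Sketch of the string helpers on a small distance matrix (assumes np,
# partial, and starmap are imported at module top); note str_row keeps the
# self-distance entry, unlike the additive_phylo output tested above:
D = np.array([[0, 3, 5], [3, 0, 4], [5, 4, 0]])
print str_row(D, 0)
# 0->0:0
# 0->1:3
# 0->2:5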
from functools import partial
import numpy as np
from func import compose

#next argument is the size
simulate_prizedoor = partial(np.random.randint, 0, 3)
random_col_vals = partial(np.apply_along_axis, np.random.choice, 1)
simulate_guess = np.ones
rowchoice = compose(np.random.choice, np.ma.compressed)

RUNS = 1000

def goat_doors(pzs, gss):
    grid = np.repeat(np.ma.arange(3), RUNS).reshape(3, RUNS)
    unpicked_matrix = (grid == pzs) | (grid == gss)
    grid.mask = unpicked_matrix
    return np.array(map(rowchoice, grid.T))

switch_guess = goat_doors

def win_percentage(pzs, gss):
    #return (pzs == gss).sum()/float(len(gss))
    return 100 * (pzs == gss).mean()

def sim_game(switch=False):
    pzs, gss = simulate_prizedoor(RUNS), simulate_guess(RUNS)
    goats = goat_doors(pzs, gss)
    picks = switch_guess(gss, goats) if switch else gss
    return win_percentage(picks, pzs)

print sim_game(True)
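# Monty Hall sanity check: switching should win about 66% of the time and
# staying about 33%; both are Monte Carlo estimates, so expect a percent or
# two of jitter at RUNS = 1000.
print sim_game(False)   # ~33.3
print sim_game(True)    # ~66.6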
from fn import _, F
from fn.iters import take, accumulate
from utils import slider
from assembly import drawgraph
from numpy import nan

def to_adj_list(G, edgekey='weight', bothways=True, as_float=False):
    edges = sorted(ifilterfalse(lambda x: x[0] == x[1], G.edges(data=True)))
    if as_float:
        res = map(lambda x: (x[0], x[1], float(x[-1]['weight'])), edges)
        form = "{0}->{1}:{2:.3f}\n{1}->{0}:{2:.3f}".format
    else:
        res = map(lambda x: (x[0], x[1], int(x[-1]['weight'])), edges)
        form = "{0}->{1}:{2}".format if not bothways else "{0}->{1}:{2}\n{1}->{0}:{2}".format
    return starmap(form, res)

adj_str = compose('\n'.join, to_adj_list)

def fst_or_none(func, seq):
    res = filter(func, seq)
    return None if not res else res[0]

filterfst = compose(next, ifilter)

def nondiag(D, i):
    return list(set(range(len(D))) - set([i]))

ndiag_perms = compose(get_products, nondiag)

def limbmatch(D, n):
    ''' find nodes i and k such that they satisfy the linear equation:
    D_ik = D_in + D_nk '''
    #print np.isnan(D[n]).all()
    def match(tup):
        i, k = tup
{
    'A': chr,
    'i': int,
    'f': float,
    'Z': str,
    'H': int,  # hex
    'B': parse_array
}

#parse cigar string
cigar_regex = r'(?:([0-9]+)([MIDNSHPX=]))+?'
reg = re.compile(cigar_regex)
tups = reg.findall('15S213M23S')
key, value = itemgetter(1), itemgetter(0)
groups = groupby(sorted(tups, key=key), key)
get_counts = pmap(compose(int, itemgetter(0)))
sum_counts = compose(sum, get_counts)
cigar_dict = dict((name, sum_counts(nums)) for name, nums in groups)
mismatches = sum(num for key, num in cigar_dict.items() if key not in 'M=')
#dictmap(compose(sum, get_counts), dict(groups))
#sum(starmap(to_cigar, tups))
#dict(map(reverse, tups))

'''
assert sum(itemgetter('M', 'I', 'S', '=', 'X')) == len(seq) == len(quality), \
    "cigar string M/I/S/=/X should sum to the length of the query sequence."
'''

#TODO: parse flag
#TODO: handle empty cases (unmapped reads, *)

index = ['QNAME', 'POS', 'REF']