def parse_cigar(cigar_str):
    """Aggregate a CIGAR string into per-operation base counts.

    Returns a dict with one 'cigar_<OP>' entry per operation letter seen
    (lengths summed across repeated operations), plus 'cigar_score': the
    total length of every operation other than M and = (a rough
    mismatch/clip score).
    """
    # One (length, op) pair per CIGAR element; `+?` keeps the repeated
    # group from being greedy across elements.
    pairs = re.findall(r'(?:([0-9]+)([MIDNSHPX=]))+?', cigar_str)
    counts = {}
    for length, op_char in pairs:
        label = "cigar_{0}".format(op_char)
        counts[label] = counts.get(label, 0) + int(length)
    # Everything that is not an alignment match/identity counts toward the score.
    score = sum(n for name, n in counts.items() if name not in ['cigar_M', 'cigar_='])
    return merge_dicts(counts, {'cigar_score': score})
def parseM(raw):
    '''Parse & return a space-separated matrix.

    raw -- newline-delimited text; blank lines are skipped; each remaining
    line holds whitespace-separated numbers.
    Returns an np.matrix of floats.
    '''
    # List comprehensions (not map/filter objects) so this works under both
    # Python 2 and Python 3 -- np.matrix cannot consume lazy iterators.
    rows = [line.split() for line in raw.split('\n') if line]
    return np.matrix([[float(tok) for tok in row] for row in rows])
def walk(G, vstd, cycle, start, current=None, call=0):
    """Random walk over unvisited edges of G until the walk returns to `start`.

    G       -- graph exposing .edges(node) (networkx-style; assumed -- TODO confirm)
    vstd    -- set of edges already visited
    cycle   -- tuple of nodes accumulated so far
    start   -- node at which the walk terminates when revisited
    current -- node the walk currently stands on (None means "start here")
    call    -- recursion-depth counter (diagnostic only)
    Returns (visited-edge set, node tuple).  random.choice raises
    IndexError when the current node has no unvisited edges left.
    """
    #TODO: I think this leaves out the final step of the cycle.
    if start == current:
        return vstd, cycle# + tuple([current])
    #NOTE: checking for boolean of 0 is bad here haha
    #_current = start if current else current
    # `is None` (not truthiness) so that node 0 is a valid current node.
    _current = start if current is None else current
    candidates = set(G.edges(_current)) - vstd
    #candidates = filterfalse(vstd.__contains__, G.neighbors(current))
    # Pick an arbitrary unvisited edge; nondeterministic by design.
    edge = random.choice(tuple(candidates))
    nn = edge[1]
    return walk(G, vstd | set([edge]), cycle + tuple([nn]), start, nn, call+1)

# First element of the iterable satisfying the predicate: next(ifilter(pred, seq)).
filterfst = compose(next, ifilter)

def edges_of_path(G, p):
    """Map consecutive node pairs of path `p` to their edge 'kmer' data.
    Assumes G.get_edge_data(u, v)[0]['kmer'] exists -- TODO confirm against
    the graph-construction code."""
    return map(X[0]['kmer'], starmap(F(G.get_edge_data), slider(p, 2)))

# Path -> string: join the per-edge kmer fragments back together.
reconstruct_str = compose_all(''.join, pmap(''.join), edges_of_path)

def e_cycle(G, vstd=set(), cycle=(), call=0):
    ''' find a Eulerian path in a graph by iteratively expanding a cycle.
    requires a mostly-balanced and connected graph.'''
    # NOTE(review): mutable default `vstd=set()` is shared across calls --
    # repeated top-level calls would see stale visited edges; verify intended.
    # Done once every edge of G has been visited.
    if len(vstd) == len(G.edges()):
        return cycle
    def valid(N):
        # A node can restart the cycle if it still has unvisited edges.
        edges=G.edges(N)
        return not (set(edges) <= vstd)
        #return bool(map(F(filterfalse, vstd.__contains__), edges))
    if not cycle:
        valid_start = random.choice(G.nodes()) # 6
        cycle = tuple([valid_start])
    else:
        valid_start = filterfst(valid, cycle)
    # NOTE(review): the function body appears truncated in this view --
    # the recursive cycle-expansion step that uses valid_start is missing.
# return ddist(centers).argmin() # #return min(centers, key=ddist) def makematrices(s): _centers, _data = splitby(_not(isin('------')), ifilter(bool, s)) #centers = map(makenp, islice(_centers, 1, None)) #data = map(makenp, islice(_data, 1, None)) centers = makenp(islice(_centers, 1, None)) data = makenp(islice(_data, 1, None)) return centers, data isin = partial(methodcaller, '__contains__') makearray = compose_all(np.array, pmap(np.array), pmap(float), psplit(' ')) makenp = compose(np.array, pmap(makearray)) def get_in_out(s): raw_in, raw_out = splitby(_not(isin('Output')), ifilter(bool, s)) k = int(next(raw_in).split(' ')[0]) _in = makenp(raw_in) _out =makenp(islice(raw_out, 1, None)) return _in, _out, k lines = open('Lloyd.txt').readlines() input, expected, k = get_in_out(lines) print soft_k_means_cluster(input, k=3) from matplotlib import pyplot
# True iff the given bit is set in the FLAG integer (bitwise AND -> bool).
eval_flag = compose(bool, op.and_)

def flag_dict(flag):
    """Expand a SAM FLAG integer into {meaning: bool} using flag_meanings."""
    return dict((meaning, eval_flag(bit, flag)) for bit, meaning in flag_meanings.items())

def split_list(A, idx):
    """Split sequence A at idx -> (A[:idx], A[idx:])."""
    return A[:idx], A[idx:]

# The eleven mandatory SAM columns, in order; optional fields follow them.
sam_columns = ("QNAME", "FLAG", "RNAME", "POS", "MAPQ", "CIGAR", "RNEXT", "PNEXT", "TLEN", "SEQ", "QUAL")
#options
#TODO: get_record function takes a filehandle and returns a single record via SeqIO, etc.
#So functions expect a dictionary I guess
#pass
# Optional fields -> dict.
parse_options = compose(dict, pmap(parse_option)) #, tabsplit)
#readfields = compose(tabsplit, next)
# Zip a row's fields with the mandatory column names into a dict.
line_to_dict = compose_all(dict, partial(zip, sam_columns)) #, tabsplit)
# ...then coerce/validate the field values against basic_schema.
validated_dict = compose(basic_schema.validate, line_to_dict)
# Tab-split a raw row, then split it into (mandatory fields, optional fields).
fields_and_options = compose(partial2(split_list, len(sam_columns)), tabsplit)
parsers = partial(fzip, [validated_dict, parse_options])
parse_fields_and_options = compose(parsers, fields_and_options)
# Raw row -> merged dict of mandatory + optional fields (CIGAR left unparsed).
all_but_cigar_dict = starcompose(merge_dicts, parse_fields_and_options)
get_cigar_dict = compose(parse_cigar, itemgetter('CIGAR'))
get_flag_dict = compose(flag_dict, itemgetter('FLAG'))
get_error = compose(sanger_qual_str_to_error, itemgetter('QUAL'))

def load_sam(fh):
    """Read a SAM filehandle into a pandas DataFrame, one row per alignment.
    NOTE(review): relies on get_row, which is not defined in this view."""
    dicts = map(get_row, ifilter(bool, fh.read().split('\n')))
    return pd.DataFrame(dicts)
#TODO: do we really need indices? it complicates querying; it looks like maybe `where` plays better with them
'''
pcompose = partial(partial, compose)
error_from_ints = pcompose(error)
#sanger_qual_str_to_error = cmperror(qual_to_phreds)
'''
# SeqIO.parse pre-bound to a format; call with a handle or filename.
get_fastq = partial(SeqIO.parse, format='fastq')
get_fasta = partial(SeqIO.parse, format='fasta')
# Build a numpy int array from a sequence.
to_np_int = partial(np.array, dtype=int)
# Count of G/C characters in a sequence (a count, not a fraction).
gccontent = compose(ilen, pifilter('GC'.__contains__))
# Sanger encoding: quality char -> phred int via ASCII code minus 33.
minus33 = partial(add, -33)
qual_int_sanger = compose(minus33, ord)
'''
Error = 10^-(Phred/10)
'''
# Quality string -> numpy array of phred ints.
qual_to_phreds = compose(to_np_int, pmap(qual_int_sanger))
# phred -> error probability: presumably 10 ** (phred / -10.0); depends on
# partial2 binding div's *second* argument -- TODO confirm partial2 semantics.
error = compose(partial(pow, 10), partial2(div, -10.0))
#don't need to map because numpy vectorizes it automatically
#TODO: handle non-sanger version
sanger_qual_str_to_error = compose(error, qual_to_phreds)
#SANGER_OFFSET = 33
'''
assert len(quality) == len(error) == len(phred_scores)
'''
'''
'''
pcompose = partial(partial, compose)
error_from_ints = pcompose(error)
#sanger_qual_str_to_error = cmperror(qual_to_phreds)
'''
# SeqIO.parse pre-bound to a format; call with a handle or filename.
get_fastq = partial(SeqIO.parse, format='fastq')
get_fasta = partial(SeqIO.parse, format='fasta')
# Build a numpy int array from a sequence.
to_np_int = partial(np.array, dtype=int)
# Count of G/C characters in a sequence (a count, not a fraction).
gccontent = compose(ilen, pifilter('GC'.__contains__))
# Sanger encoding: quality char -> phred int via ASCII code minus 33.
minus33 = partial(add, -33)
qual_int_sanger = compose(minus33, ord)
'''
Error = 10^-(Phred/10)
'''
# Quality string -> numpy array of phred ints.
qual_to_phreds = compose(to_np_int, pmap(qual_int_sanger))
# phred -> error probability: presumably 10 ** (phred / -10.0); depends on
# partial2 binding div's *second* argument -- TODO confirm partial2 semantics.
error = compose(partial(pow, 10), partial2(div, -10.0))
#don't need to map because numpy vectorizes it automatically
#TODO: handle non-sanger version
sanger_qual_str_to_error = compose(error, qual_to_phreds)
#SANGER_OFFSET = 33
'''
assert len(quality) == len(error) == len(phred_scores)
'''
# NOTE(review): this section duplicates the error/quality helper definitions
# that appear earlier in the file -- likely a concatenation artifact.
#validate = scheme.validate
#TODO: could make these validations match samtools spec
#TODO: Could treat options/cigar string as their own class with their own parsing and validation.
def parsematrix(raw):
    '''Parse a whitespace-separated integer matrix.

    raw -- newline-delimited text; blank lines are skipped; each remaining
    line holds whitespace-separated integers.
    Returns an np.ma.array (masked array) of ints with mask=False, so
    callers can mask individual entries later.
    '''
    # List comprehensions instead of map/filter so the input is a real
    # nested list under Python 3 as well (np.ma.array can't take a map object).
    rows = [line.split() for line in raw.split('\n') if line]
    return np.ma.array([[int(tok) for tok in row] for row in rows], mask=False)
from functools import partial
import operator as op
from operator import add, div  # Python 2: `div` was removed in Python 3
from schema import Schema, Use
from itertools import ifilter  # Python 2 itertools
# Parse options
#from pyparsing import Regex

# Build a numpy int array from a sequence.
to_np_int = partial(np.array, dtype=int)
# "[1,2,3]" -> np.array([1, 2, 3]): strip the brackets, split on commas.
parse_array = compose_all(to_np_int, psplit(','), pstrip('[]'))
tabsplit = psplit('\t')
# Sanger encoding: quality char -> phred int via ASCII code minus 33.
minus33 = partial(add, -33)
qual_int_sanger = compose(minus33, ord)
# Quality string -> numpy array of phred ints.
qual_to_phreds = compose(to_np_int, pmap(qual_int_sanger))
error = compose(partial(pow, 10), partial2(div, -10.0))
#don't need to map because numpy vectorizes it automatically
#TODO: handle non-sanger version
sanger_qual_str_to_error = compose(error, qual_to_phreds)

# Coercion targets for the mandatory SAM columns.
# NOTE(review): this dict continues past the visible region -- truncated here.
basic_scheme={
    'QNAME' : str,
    'FLAG' : int,
    'RNAME' : str,
    'POS' : int,
    'MAPQ' : int,
    'CIGAR' : str,
    'RNEXT' : str,
    'PNEXT' : int,
    'TLEN' : int,
# NOTE(review): headless dict literal -- apparently the tail of a truncated
# assignment mapping SAM optional-field TYPE codes to parser callables.
{ 'A' : chr,
  'i' : int,
  'f' : float,
  'Z' : str,
  'H' : int, # hex
  'B' : parse_array }

#parse cigar string
# Module-level scratch/example code exercising the CIGAR regex on a sample.
cigar_regex = r'(?:([0-9]+)([MIDNSHPX=]))+?'
reg = re.compile(cigar_regex)
tups = reg.findall('15S213M23S')
key,value = itemgetter(1), itemgetter(0)
# Group the (count, op) pairs by op letter (groupby needs the sort first).
groups = groupby(sorted(tups, key=key), key)
get_counts = pmap(compose(int, itemgetter(0)))
sum_counts = compose(sum, get_counts)
# Per-op-letter summed lengths, e.g. {'S': 38, 'M': 213} for the sample.
cigar_dict = dict( (name, sum_counts(nums)) for name, nums in groups)
# Total length of every op other than M and = (substring test on 'M=').
mismatches = sum(num for key, num in cigar_dict.items() if key not in 'M=')
#dictmap(compose(sum, get_counts), dict(groups))
#sum(starmap(to_cigar, tups))
#dict(map(reverse, tups))
'''
assert sum(itemgetter('M', 'I', 'S', '=', 'X')) == len(seq) == len(quality), \
    "cigar string M/I/S/=/X should sum to the length of the query sequence."
'''
#TODO: parse flag
#TODO: handle empty cases (unmapped reads, *)
index = ['QNAME', 'POS', 'REF']