Example #1
0
def fqframe(fileh):
    """Assemble the parts needed to turn FASTQ records into validated rows.

    :param fileh: open FASTQ handle; its ``name`` attribute is read, so the
        handle must provide one.
    :return: dict with the keys
        ``obj_func``    -- zero-argument callable returning the next raw
                           record from ``get_fastq(fileh)``,
        ``columns``     -- column names, in the same order as ``getters``,
        ``getters``     -- one extractor per column, applied to a raw record,
        ``validator``   -- Schema describing one finished row,
        ``dictgetters`` -- always ``None``.
    """
    columns = ('id', 'seq', 'quality', 'description', 'qual_ints', 'error')

    validator = Schema({
        'id' : str,
        'seq' : str,
        'quality' : str,
        'qual_ints' : check_np_type('int64'),
        'error' : check_np_type('float64'),
        'description' : str
    })

    # Per-base phred scores as a numpy array; also the input of the 'error'
    # column, so it is built once and shared.
    read_qual_ints = compose_all(
        np.array,
        itemgetter('phred_quality'),
        attr('_per_letter_annotations'),
    )

    # One extractor per entry of `columns`, in the same order.
    getters = [
        attr('id'),
        compose(str, attr('seq')),
        SeqIO.QualityIO._get_sanger_quality_str,
        attr('description'),
        read_qual_ints,
        compose(error, read_qual_ints),
    ]
    assert len(getters) == len(columns)

    # Not part of the returned mapping; the attribute access is retained so
    # that a handle without ``name`` still fails here, before any parsing.
    metadata = {'filename' : fileh.name}

    records = get_fastq(fileh)
    next_record = partial(next, records)

    return {
        'obj_func' : next_record,
        'columns' : columns,
        'getters' : getters,
        'validator' : validator,
        'dictgetters' : None,
    }
Example #2
0
def fqframe(fileh):
    """Return a ``FastqFrame`` namedtuple of two closures over ``fileh``.

    ``get_row(record)`` builds one schema-validated dict from a record;
    ``load_fastq()`` runs ``get_row`` over every record produced by
    ``get_fastq(fileh)`` and returns a DataFrame indexed by ``id``.
    """
    # Shape of one finished row; used by get_row to validate its result.
    final_schema =  Schema({
        'id' : str,
        'seq' : str,
        'quality' : str,
        'qual_ints' : check_np_type('int64'),
        'error' : check_np_type('float64'),
        'description' : str
    })

    #get_object = _id
    index = ['id']
    columns = ('id', 'seq', 'quality', 'description', 'qual_ints', 'error')
    SANGER = True
    # NOTE(review): the get_* locals below are not referenced by get_row,
    # which looks getters up on the module object by name instead.
    get_id = attr('id')
    get_seq= compose(str, attr('seq'))
    get_qual_ints = compose_all(np.array, itemgetter('phred_quality'), attr('_per_letter_annotations'))
    get_description = attr('description')
    get_quality = SeqIO.QualityIO._get_sanger_quality_str
    get_error = compose(error, get_qual_ints)
    #get_error = error_from_ints(get_qual_ints)

    def get_row(record):
        """Build, validate and return the row dict for one record."""
        #record = next(fileh)
        # NOTE(review): get_funcs is not defined in the visible code; this
        # looks like leftover debug output -- confirm before relying on it.
        print(get_funcs())
        import sys
        __module__ = sys.modules[__name__]
        # Turns a column name into a getter for the attribute "get_<column>".
        # presumably attr(name) is an attribute getter -- TODO confirm.
        get_getter = compose(attr, "get_{0}".format)
        _getters = map(get_getter, columns)
        # NOTE(review): resolves get_<column> on this module, not on the
        # enclosing function's locals; verify those module attributes exist.
        self_getters = apply_each(_getters, __module__) #fzip(_getters, repeat(__module__, clen))
        results = apply_each(self_getters, record)
        final_dict = dict(zip(columns, results))
        final_schema.validate(final_dict)
        return final_dict

    def load_fastq():
        """Read every record from ``fileh`` into a DataFrame indexed by ``id``."""
        fq = get_fastq(fileh)
        dicts = map(get_row, fq)
        return pd.DataFrame(dicts).set_index(index) #, index=index, columns=columns)

    return namedtuple('FastqFrame', ['get_row', 'load_fastq'])(get_row, load_fastq)#{'get_row' : get_row, 'load_fastq' : load_fastq}
Example #3
0
def walk(G, vstd, cycle, start, current=None, call=0):
    """Follow randomly chosen unvisited edges until the walk is back at ``start``.

    :param G: graph object; ``G.edges(node)`` must return the edges leaving
        ``node`` as hashable pairs whose second item is the node reached.
    :param vstd: set of edges already used; it is not modified.
    :param cycle: tuple of nodes visited so far; it is not modified.
    :param start: node at which the walk begins and must end.
    :param current: node the walk is standing on; ``None`` means "not moved
        yet", in which case the walk sets off from ``start``.
    :param call: accepted for backward compatibility; not used.
    :return: ``(visited_edges, cycle)`` where ``visited_edges`` is ``vstd``
        plus every edge taken and ``cycle`` is the input tuple extended with
        every node reached, the final arrival at ``start`` included.
    :raises IndexError: if the walk reaches a node with no unvisited edge
        before it gets back to ``start``.
    """
    # Iterative on purpose: one stack frame per edge would hit the
    # interpreter's recursion limit on walks longer than ~1000 edges.
    while not (start == current):
        # `is None` rather than truthiness: node 0 is a legitimate position.
        here = start if current is None else current
        candidates = set(G.edges(here)) - vstd
        edge = random.choice(tuple(candidates))
        current = edge[1]
        # Build new containers so the caller's set and tuple stay untouched.
        vstd = vstd | {edge}
        cycle = cycle + (current,)
    return vstd, cycle

# filterfst(pred, iterable): first item of `iterable` for which `pred` holds.
# presumably compose(f, g)(*args) == f(g(*args)) -- confirm against func.compose.
filterfst = compose(next, ifilter)
def edges_of_path(G, p):
    """Return the 'kmer' entry of the edge data for each consecutive node pair of ``p``.

    ``p`` is cut into overlapping windows of two nodes; each window is passed
    to ``G.get_edge_data`` and ``X[0]['kmer']`` is applied to the result.
    """
    node_pairs = slider(p, 2)
    edge_records = starmap(F(G.get_edge_data), node_pairs)
    return map(X[0]['kmer'], edge_records)
# reconstruct_str(G, p): the edge kmers along path `p`, each joined into a
# string and then all concatenated into one string.
# presumably compose_all applies right-to-left and pmap is a partially applied
# map -- TODO confirm against the func module.
reconstruct_str = compose_all(''.join, pmap(''.join), edges_of_path)

def e_cycle(G, vstd=set(), cycle=(), call=0):
    ''' find a Eulerian path in a graph by iteratively expanding a cycle.
    requires a mostly-balanced and connected graph.'''
    if len(vstd) == len(G.edges()): return cycle

    def valid(N):
        edges=G.edges(N)
        return not (set(edges) <= vstd)
        #return bool(map(F(filterfalse, vstd.__contains__), edges))
    if not cycle:
        valid_start = random.choice(G.nodes()) # 6
        cycle = tuple([valid_start])
    else:
        valid_start = filterfst(valid, cycle)
Example #4
0
import re
import pandas as pd
from bioframes import to_np_int, sanger_qual_str_to_error
from itertools import groupby
from func import pmap, psplit, pstrip, compose, compose_all, merge_dicts, fzip, partial2, dictmap, starcompose
from operator import itemgetter
from functools import partial
import operator as op
from schema import Schema, Use
from itertools import ifilter
# Parse options
#from pyparsing import Regex

# parse_array: strip '[]' from a string, split it on ',', then hand the pieces
# to to_np_int. presumably compose_all applies right-to-left and psplit/pstrip
# are partially applied str.split/str.strip -- TODO confirm against func.
parse_array = compose_all(to_np_int, psplit(','), pstrip('[]'))
# tabsplit: splitter on tab characters, built from the same psplit helper.
tabsplit = psplit('\t')

basic_scheme={
    'QNAME' : str,
    'FLAG' : int,
    'RNAME' : str,
    'POS' : int,
    'MAPQ' : int,
    'CIGAR' : str,
    'RNEXT' : str,
    'PNEXT' : int,
    'TLEN' : int,
    #'MRNM' : str,
    #'MRNM' : '*='.__contains__,
    #'MPOS' : int,
    #'ISIZE' : int,
    'SEQ' : str,
Example #5
0
        print( func.__name__)
        print( args, kwargs)
        #print formatAllArgs(args, kwargs)
        return func(*args, **kwargs)
    return wrap

def slider(seq, window, start=0):
    """Yield each contiguous slice of ``seq`` of length ``window``, left to right.

    >>> list(slider([0, 1, 2], 2))
    [[0, 1], [1, 2]]
    >>> list(slider('ABCDE', 4))
    ['ABCD', 'BCDE']
    >>> list(slider('ABCDE', 1)) == list('ABCDE')
    True
    >>> list(slider('ABCDE', 2, start=3))
    ['DE']

    :param seq: any sliceable sequence supporting ``len``.
    :param window: length of every yielded slice.
    :param start: offset of the first window; expected to be non-negative.
    :return: generator of slices; empty when no full window fits.
    """
    last_offset = len(seq) - window
    # `start` is the first window offset; it used to be accepted and ignored.
    # `range` keeps the function usable on both Python 2 and Python 3.
    for idx in range(start, last_offset + 1):
        yield seq[idx:idx + window]

# filterfst(pred, iterable): first item of `iterable` for which `pred` holds.
# presumably compose(f, g)(*args) == f(g(*args)) -- confirm against compose.
filterfst = compose(next, ifilter)
# composition(seq, window): every length-`window` slice of `seq`, as a sorted
# list. presumably compose_all applies right-to-left -- TODO confirm.
composition = compose_all(sorted, list, slider)

def fromstr(_in):
    """Parse dataset text: the first non-blank line is ``k``, the rest is the sequence.

    Blank lines are ignored and surrounding whitespace (including ``\\r`` from
    Windows line endings) is removed from every line before use.

    :param _in: whole dataset as one string.
    :return: ``(s, k)`` -- the remaining lines concatenated, and ``k`` as int.
    :raises IndexError: if ``_in`` contains no non-blank line.
    :raises ValueError: if the first non-blank line is not an integer.
    """
    # A list (not a lazy filter object) so it can be indexed and sliced on
    # Python 3 as well; stripping keeps stray whitespace out of the sequence.
    lines = [line.strip() for line in _in.split('\n') if line.strip()]
    k = int(lines[0])
    s = ''.join(lines[1:])
    return s, k
# cfromstr(text): parse `text` with fromstr, then feed the resulting (s, k)
# pair to composition. presumably starcompose unpacks the inner function's
# result into positional arguments -- TODO confirm against func.starcompose.
cfromstr = starcompose(composition, fromstr)
#assert ["AATCC", "ATCCA", "CAATC", "CCAAC", "TCCAA"] == cfromstr(r_in)

#neighbors = filter(X[:k] == sfx, prefixg)
#NOTE: using generators over lists makes a huge difference.
def make_ovrlp_graph(kmers):
    N = len(kmers)
    ov = len(kmers[0]) - 1
    M = np.zeros((N, N))
Example #6
0
#    return ddist(centers).argmin()
#    #return min(centers, key=ddist)


def makematrices(s):
    """Split the lines of ``s`` at the '------' marker and parse both halves.

    Falsy (empty) lines are dropped first. The remaining lines are divided by
    ``splitby`` using "does not contain '------'" as the predicate, and the
    first line of each part is skipped before it is handed to ``makenp``.
    presumably splitby returns the (leading-run, rest) pair -- TODO confirm.

    :param s: iterable of text lines.
    :return: ``(centers, data)`` as produced by ``makenp``.
    """
    nonblank = ifilter(bool, s)
    before_rule, from_rule = splitby(_not(isin('------')), nonblank)
    centers = makenp(islice(before_rule, 1, None))
    data = makenp(islice(from_rule, 1, None))
    return centers, data



# isin(x) builds a callable that invokes obj.__contains__(x) on its argument,
# i.e. isin(x)(container) asks whether `container` contains `x`.
isin = partial(methodcaller, '__contains__')
# makearray: split a string on single spaces, convert the pieces with float,
# and wrap the result in numpy arrays. presumably compose_all applies
# right-to-left and pmap is a partially applied map -- TODO confirm.
makearray = compose_all(np.array, pmap(np.array), pmap(float), psplit(' '))
# makenp: apply makearray to every line and collect the rows in one np.array.
makenp = compose(np.array, pmap(makearray))
def get_in_out(s):
    """Parse a dataset whose lines are divided by a line containing 'Output'.

    Falsy (empty) lines are dropped, then ``splitby`` divides the rest with
    the predicate "does not contain 'Output'". The first line of the leading
    part supplies ``k`` (its first space-separated field); the remaining
    leading lines are the input matrix. The first line of the trailing part
    is skipped and the rest is the expected output matrix.
    presumably splitby returns the (leading-run, rest) pair -- TODO confirm.

    :param s: iterable of text lines.
    :return: ``(_in, _out, k)``.
    """
    nonblank = ifilter(bool, s)
    head, tail = splitby(_not(isin('Output')), nonblank)
    header = next(head)
    k = int(header.split(' ')[0])
    _in = makenp(head)
    _out = makenp(islice(tail, 1, None))
    return _in, _out, k


# Script driver: load the Lloyd dataset, parse it and print the clustering.
# The context manager closes the file handle as soon as the lines are read.
with open('Lloyd.txt') as dataset:
    lines = dataset.readlines()
# `input` shadows the builtin; the name is kept because later module-level
# code may refer to it.
input, expected, k = get_in_out(lines)
# NOTE(review): the k parsed from the file is not used here; the cluster
# count is hard-coded to 3 -- confirm that this is intended.
# Single-argument print call: prints the same thing on Python 2 and 3.
print(soft_k_means_cluster(input, k=3))


from matplotlib import pyplot
Example #7
0
# method that gets node with matching distance
def get_match_dst(D, j, dist):
    """Return the position of the first entry of ``D[j]`` equal to ``dist``.

    ``argmax`` over the boolean comparison gives the first matching position;
    when nothing matches it gives 0, so callers cannot tell "no match" apart
    from "match at position 0".

    :param D: array-like indexed by ``j``; the comparison must yield an object
        with ``argmax`` (assumes a numpy array -- TODO confirm).
    :param j: row to search.
    :param dist: distance to look for; must not be 0.
    """
    assert dist != 0
    hits = D[j] == dist
    return hits.argmax()
    #return D[i, (D[i] == dist)]

def get_match_dists(D, j, dist):
    """Return the positions of all entries of ``D[j]`` equal to ``dist``.

    :param D: array-like indexed by ``j``; the comparison must yield an object
        with ``nonzero`` (assumes a numpy array -- TODO confirm).
    :param j: row to search.
    :param dist: distance to look for; must not be 0.
    :return: whatever ``nonzero()`` returns for the comparison -- for a numpy
        row, a one-element tuple holding the array of matching positions.
    """
    assert dist != 0
    hits = D[j] == dist
    return hits.nonzero()


def non_diag(D, j):
    """Return the row indices of ``D`` other than ``j``, in ascending order.

    :param D: object with a ``shape`` whose first entry is the row count.
    :param j: index to leave out.
    :return: list ``[0, ..., j-1, j+1, ..., D.shape[0]-1]``.
    """
    # Explicit lists: `range(...) + range(...)` only works where range()
    # returns a list, and fails on Python 3 range objects.
    below = list(range(0, j))
    above = list(range(j + 1, D.shape[0]))
    return below + above

# nondiag_products(D, j): list of get_products over the indices of D other
# than j. presumably compose_all applies right-to-left and get_products
# yields index combinations -- TODO confirm; get_products is defined elsewhere.
nondiag_products = compose_all(list, get_products, non_diag)
# Same as above, with get_products called with times=3.
nondiag_products3 = compose_all(list, partial(get_products, times=3), non_diag)
# list(get_products(..., times=3)) applied directly to its argument, without
# the non_diag step.
products3 = compose_all(list, partial(get_products, times=3))
def additive_phyloZ(D, n):
    if n == 2:
        return  str_matrix(D)
    ll = limb_len(D, n)
    non_diag = range(0, n) + range(n+1, D.shape[0])
    D[non_diag, j] -= ll
    D[j, non_diag] -= ll
    # get matching i, n, k
    D.mask[n] = D.mask[:, n] = True
    T = additive_phylo(D, n-1)
    v_candidates_i = get_match_dst(T, i, x)
    along_path = lambda c: D[k, c] + D[i, c] == x
    v = filterfst(along_path, v_candidates_i)