Exemple #1
0
def fa_cuts(fname, e1, e2):
    '''Return the fragment lengths of every sequence in a FASTA file.

    The result is cached on disk next to the input, in
    "<fname>_<e1>-<e2>.list", as the repr() of the returned list.  When that
    cache file already exists it is loaded and returned and fname itself is
    not read.

    fname  -- path of the FASTA file ('>' lines start a new sequence)
    e1, e2 -- forwarded unchanged to fraglen_from_seq() (defined outside this
              block) and used to build the cache file name

    Returns a list: the concatenation of fraglen_from_seq(seq, e1, e2) over
    all sequences, in file order.  Sequence text is upper-cased and stripped
    of surrounding whitespace before being joined.

    Progress messages are written to stderr.
    '''
    import ast  # local import: only needed to parse the cache file

    listfile = '%s_%s-%s.list' % (fname, e1, e2)
    if os.path.exists(listfile):
        sys.stderr.write('%s found, load fragment lengths ...' % listfile)
        with open(listfile) as cache:
            # NOTE: literal_eval rather than eval -- the cache is a plain list
            # repr, so nothing read from disk should ever be executed.
            # Assumes fraglen_from_seq yields plain literals (numbers,
            # strings, tuples) -- TODO confirm against its definition.
            fraglens = ast.literal_eval(cache.read())
        sys.stderr.write(' %s fragments loaded\n' % len(fraglens))
        return fraglens

    fraglens = []
    this = []       # sequence lines of the record currently being read
    seqnum = 0
    with open(fname) as fasta:
        for l in fasta:
            if l.startswith('>'):
                if this:
                    # a new header closes the previous record
                    fraglens.extend(fraglen_from_seq(''.join(this), e1, e2))
                    this = []
                sname = l[1:].strip()
                seqnum += 1
                # single-line progress display: '\r' returns to column 0 and
                # the padding overwrites leftovers of a longer previous name
                outn = ' (sequence %s)' % (seqnum)
                om = '\r%s %s' % (sname[:80 - len(outn)], outn)
                pad = ' ' * (80 - len(om) + 1)
                sys.stderr.write(om + ' ' + pad + ' ')
            else:
                this.append(l.strip().upper())

    # The last record has no following header to close it.  As in the
    # original code this call is made even when no sequence data was read.
    fraglens.extend(fraglen_from_seq(''.join(this), e1, e2))

    sys.stderr.write('store %s fragment lengths to %s\n'
                     % (len(fraglens), listfile))
    with open(listfile, 'w') as cache:
        cache.write(repr(fraglens))
    return fraglens
Exemple #2
0
def fa_cuts(fname, e1, e2):
    '''Return the fragment lengths of every sequence in a FASTA file.

    The result is cached on disk next to the input, in
    "<fname>_<e1>-<e2>.list", as the repr() of the returned list.  When that
    cache file already exists it is loaded and returned and fname itself is
    not read.

    fname  -- path of the FASTA file ('>' lines start a new sequence)
    e1, e2 -- forwarded unchanged to fraglen_from_seq() (defined outside this
              block) and used to build the cache file name

    Returns a list: the concatenation of fraglen_from_seq(seq, e1, e2) over
    all sequences, in file order.  Sequence text is upper-cased and stripped
    of surrounding whitespace before being joined.

    Progress messages are written to stderr.
    '''
    import ast  # local import: only needed to parse the cache file

    listfile = '%s_%s-%s.list' % (fname, e1, e2)
    if os.path.exists(listfile):
        sys.stderr.write('%s found, load fragment lengths ...' % listfile)
        with open(listfile) as cache:
            # NOTE: literal_eval rather than eval -- the cache is a plain list
            # repr, so nothing read from disk should ever be executed.
            # Assumes fraglen_from_seq yields plain literals (numbers,
            # strings, tuples) -- TODO confirm against its definition.
            fraglens = ast.literal_eval(cache.read())
        sys.stderr.write(' %s fragments loaded\n' % len(fraglens))
        return fraglens

    fraglens = []
    this = []       # sequence lines of the record currently being read
    seqnum = 0
    with open(fname) as fasta:
        for l in fasta:
            if l.startswith('>'):
                if this:
                    # a new header closes the previous record
                    fraglens.extend(fraglen_from_seq(''.join(this), e1, e2))
                    this = []
                sname = l[1:].strip()
                seqnum += 1
                # single-line progress display: '\r' returns to column 0 and
                # the padding overwrites leftovers of a longer previous name
                outn = ' (sequence %s)' % (seqnum)
                om = '\r%s %s' % (sname[:80 - len(outn)], outn)
                pad = ' ' * (80 - len(om) + 1)
                sys.stderr.write(om + ' ' + pad + ' ')
            else:
                this.append(l.strip().upper())

    # The last record has no following header to close it.  As in the
    # original code this call is made even when no sequence data was read.
    fraglens.extend(fraglen_from_seq(''.join(this), e1, e2))

    sys.stderr.write('store %s fragment lengths to %s\n'
                     % (len(fraglens), listfile))
    with open(listfile, 'w') as cache:
        cache.write(repr(fraglens))
    return fraglens
def load_cluster_data(gr, tab, mID=None):
    '''given an mcl graph output file, an mcl tab file of sequence labels, and an mID file of individuals per label,

    returns (clusters,labels,mID_by_label) dicts

    gr  -- path of the graph file; the text between the first "begin" and
           the next ")" is split on "$" into rows, and every row with more
           than one whitespace-separated field becomes
           clusters[first_field] = [remaining fields]
    tab -- path of the tab file; each non-blank line must hold exactly two
           whitespace-separated fields and becomes labels[first] = second
    mID -- optional path; each non-blank line becomes
           mID_by_label[first_field] = [remaining fields].
           When mID is None, mID_by_label is None.

    Raises ValueError if gr contains no "begin ... )" section.
    Progress messages are written to stderr.
    '''

    #extract clusters from mcl output
    sys.stderr.write('load graph...')
    with open(gr) as fh:
        fc = fh.read()
    sys.stderr.write(' done\nparse graph...')
    # DOTALL lets '.' cross line breaks, so the file content does not have to
    # be copied with its newlines replaced first
    match = re.search(r'begin(.+?)\)', fc, re.DOTALL)
    if match is None:
        raise ValueError('no "begin ... )" section found in %s' % gr)
    clusters = {}
    for row in match.group(1).split('$'):
        fields = row.split()
        if len(fields) > 1:
            clusters[fields[0]] = fields[1:]
    sys.stderr.write(' done\nload labels...')

    #cluster labels (blank lines are skipped)
    with open(tab) as fh:
        labels = dict(l.split() for l in fh if l.strip())

    sys.stderr.write(' done\nload mIDs...')
    #individuals by labels (blank lines are skipped)
    mID_by_label = None
    if mID is not None:
        with open(mID) as fh:
            rows = (l.split() for l in fh)
            mID_by_label = dict((r[0], r[1:]) for r in rows if r)
    sys.stderr.write(' done\n')

    return clusters, labels, mID_by_label
Exemple #4
0
def load_cluster_data(gr, tab, mID=None):
    '''given an mcl graph output file, an mcl tab file of sequence labels, and an mID file of individuals per label,

    returns (clusters,labels,mID_by_label) dicts

    gr  -- path of the graph file; the text between the first "begin" and
           the next ")" is split on "$" into rows, and every row with more
           than one whitespace-separated field becomes
           clusters[first_field] = [remaining fields]
    tab -- path of the tab file; each non-blank line must hold exactly two
           whitespace-separated fields and becomes labels[first] = second
    mID -- optional path; each non-blank line becomes
           mID_by_label[first_field] = [remaining fields].
           When mID is None, mID_by_label is None.

    Raises ValueError if gr contains no "begin ... )" section.
    Progress messages are written to stderr.
    '''

    #extract clusters from mcl output
    sys.stderr.write('load graph...')
    with open(gr) as fh:
        fc = fh.read()
    sys.stderr.write(' done\nparse graph...')
    # DOTALL lets '.' cross line breaks, so the file content does not have to
    # be copied with its newlines replaced first
    match = re.search(r'begin(.+?)\)', fc, re.DOTALL)
    if match is None:
        raise ValueError('no "begin ... )" section found in %s' % gr)
    clusters = {}
    for row in match.group(1).split('$'):
        fields = row.split()
        if len(fields) > 1:
            clusters[fields[0]] = fields[1:]
    sys.stderr.write(' done\nload labels...')

    #cluster labels (blank lines are skipped)
    with open(tab) as fh:
        labels = dict(l.split() for l in fh if l.strip())

    sys.stderr.write(' done\nload mIDs...')
    #individuals by labels (blank lines are skipped)
    mID_by_label = None
    if mID is not None:
        with open(mID) as fh:
            rows = (l.split() for l in fh)
            mID_by_label = dict((r[0], r[1:]) for r in rows if r)
    sys.stderr.write(' done\n')

    return clusters, labels, mID_by_label
Exemple #5
0
def load_lines_from_uniqued(source_uniques,
                            rv_sort=True,
                            sort_key=lambda x: (len(x[0]), int(x[1])),
                            keep_source_id=False):
    '''Load, normalise and sort the rows of one or more ".uniqued" files.

    Every non-blank line of every file is split on whitespace into a list of
    fields.  Field 2 is treated as a quality string: the first row for which
    get_baseQ() (defined outside this block) returns a non-None value fixes
    the quality base of the whole file, and when that base is 64 the quality
    strings of all rows of that file are re-encoded to base 33.

    source_uniques -- iterable of file paths
    rv_sort        -- passed as reverse= to the final sort
    sort_key       -- key applied to each parsed row; the default orders by
                      (length of field 0, int(field 1))
    keep_source_id -- see below

    if keep_source_id is True
        returns list of 2-tuples (parsed_line, uniqued_id), where uniqued_id
        is the file's base name without extension
        (eg 100617_lane6_PE for "data/100617/100617_lane6_PE.uniqued")

    else list of parsed lines.

    Progress messages are written to stderr.
    '''
    uniquedlines = []
    for f in source_uniques:
        sys.stderr.write('load %s ...' % f)
        with open(f) as fh:
            rows = (l.split() for l in fh)
            # drop blank lines: they have no field 2 to inspect below
            lines = [r for r in rows if r]
        sys.stderr.write(' %s lines\n' % len(lines))

        #get qual base
        baseQ = None
        for l in lines:
            baseQ = get_baseQ(l[2])
            if baseQ is not None:
                break
        sys.stderr.write('qual base: %s\n' % baseQ)

        if baseQ == 64:
            sys.stderr.write('Translate quality encoding to base 33 ...')
            for l in lines:
                # rows are lists, so the quality field is replaced in place
                l[2] = ''.join([chr(ord(c) - 64 + 33) for c in l[2]])
            sys.stderr.write(' done\n')

        if keep_source_id:
            uniqued_id = os.path.basename(os.path.splitext(f)[0])
            uniquedlines.extend((l, uniqued_id) for l in lines)
        else:
            uniquedlines.extend(lines)

    sys.stderr.write('sort')
    if keep_source_id:
        # entries are (row, id) pairs; sort on the row only
        uniquedlines.sort(reverse=rv_sort, key=lambda x: sort_key(x[0]))
    else:
        uniquedlines.sort(reverse=rv_sort, key=sort_key)
    sys.stderr.write(' done\n')
    return uniquedlines
def load_lines_from_uniqued(source_uniques,
                            rv_sort=True,
                            sort_key=lambda x: (len(x[0]), int(x[1])),
                            keep_source_id=False):
    '''Load, normalise and sort the rows of one or more ".uniqued" files.

    Every non-blank line of every file is split on whitespace into a list of
    fields.  Field 2 is treated as a quality string: the first row for which
    get_baseQ() (defined outside this block) returns a non-None value fixes
    the quality base of the whole file, and when that base is 64 the quality
    strings of all rows of that file are re-encoded to base 33.

    source_uniques -- iterable of file paths
    rv_sort        -- passed as reverse= to the final sort
    sort_key       -- key applied to each parsed row; the default orders by
                      (length of field 0, int(field 1))
    keep_source_id -- see below

    if keep_source_id is True
        returns list of 2-tuples (parsed_line, uniqued_id), where uniqued_id
        is the file's base name without extension
        (eg 100617_lane6_PE for "data/100617/100617_lane6_PE.uniqued")

    else list of parsed lines.

    Progress messages are written to stderr.
    '''
    uniquedlines = []
    for f in source_uniques:
        sys.stderr.write('load %s ...' % f)
        with open(f) as fh:
            rows = (l.split() for l in fh)
            # drop blank lines: they have no field 2 to inspect below
            lines = [r for r in rows if r]
        sys.stderr.write(' %s lines\n' % len(lines))

        #get qual base
        baseQ = None
        for l in lines:
            baseQ = get_baseQ(l[2])
            if baseQ is not None:
                break
        sys.stderr.write('qual base: %s\n' % baseQ)

        if baseQ == 64:
            sys.stderr.write('Translate quality encoding to base 33 ...')
            for l in lines:
                # rows are lists, so the quality field is replaced in place
                l[2] = ''.join([chr(ord(c) - 64 + 33) for c in l[2]])
            sys.stderr.write(' done\n')

        if keep_source_id:
            uniqued_id = os.path.basename(os.path.splitext(f)[0])
            uniquedlines.extend((l, uniqued_id) for l in lines)
        else:
            uniquedlines.extend(lines)

    sys.stderr.write('sort')
    if keep_source_id:
        # entries are (row, id) pairs; sort on the row only
        uniquedlines.sort(reverse=rv_sort, key=lambda x: sort_key(x[0]))
    else:
        uniquedlines.sort(reverse=rv_sort, key=sort_key)
    sys.stderr.write(' done\n')
    return uniquedlines