import os
import re
import sys


def fa_cuts(fname, e1, e2):
    '''Return fragment lengths from digesting the sequences in FASTA file
    <fname> with restriction enzymes <e1> and <e2>.

    Results are cached in <fname>_<e1>-<e2>.list; if that file already
    exists, it is loaded instead of re-digesting the sequences.
    '''
    listfile = '%s_%s-%s.list' % (fname, e1, e2)
    if os.path.exists(listfile):
        print >> sys.stderr, '%s found, load fragment lengths ...' % listfile,
        fraglens = eval(open(listfile).read())
        print >> sys.stderr, '%s fragments loaded' % len(fraglens)
        return fraglens
    fraglens = []
    this = []
    seqnum = 0
    for l in open(fname):
        if l[0] == '>':
            # digest the previous sequence before starting the next one
            if this:
                seq = ''.join(this)
                fraglens.extend(fraglen_from_seq(seq, e1, e2))
                this = []
            sname = l[1:].strip()
            seqnum += 1
            outn = ' (sequence %s)' % (seqnum)
            om = '\r%s %s' % (sname[:80 - len(outn)], outn)
            print >> sys.stderr, om, ' ' * (80 - len(om) + 1),
        else:
            this.append(l.strip().upper())
    # digest the final sequence in the file
    seq = ''.join(this)
    fraglens.extend(fraglen_from_seq(seq, e1, e2))
    print >> sys.stderr, 'store %s fragment lengths to %s' % (len(fraglens), listfile)
    open(listfile, 'w').write(repr(fraglens))
    return fraglens
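# Usage sketch (illustrative, not part of the original module): digest a FASTA
# file and summarise the fragment-length distribution. 'example.fa' and the
# enzyme identifiers are placeholders; fraglen_from_seq, defined elsewhere in
# this module, determines what values e1/e2 must actually be.
def _demo_fa_cuts(fasta='example.fa', e1='ecoRI', e2='mseI'):
    fraglens = fa_cuts(fasta, e1, e2)
    fraglens.sort()
    mid = fraglens[len(fraglens) // 2] if fraglens else None
    print >> sys.stderr, '%s fragments, median length %s' % (len(fraglens), mid)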
def load_cluster_data(gr, tab, mID=None):
    '''given an mcl graph output file, an mcl tab file of sequence labels,
    and an optional mID file of individuals per label,
    returns (clusters, labels, mID_by_label) dicts
    '''
    # extract clusters from mcl output
    print >> sys.stderr, 'load graph...',
    fc = open(gr).read()
    print >> sys.stderr, 'done\nparse graph...',
    body = re.search(r'begin(.+?)\)', fc.replace('\n', ' ')).group(1)
    clusters = dict([(s.strip().split()[0], s.strip().split()[1:])
                     for s in body.strip().split('$')
                     if len(s.strip().split()) > 1])
    print >> sys.stderr, 'done\nload labels...',
    # cluster labels
    labels = dict([l.strip().split() for l in open(tab).readlines()])
    print >> sys.stderr, 'done\nload mIDs...',
    # individuals by label
    if mID is None:
        mID_by_label = None
    else:
        mID_by_label = dict([(l.split()[0], l.split()[1:]) for l in open(mID).readlines()])
    print >> sys.stderr, 'done'
    return clusters, labels, mID_by_label
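# Usage sketch (illustrative only): load MCL results and report the largest
# clusters. The file names are placeholders for whatever an mcl run produced;
# clusters maps a cluster id to member indices, and labels translates those
# numeric graph indices back to sequence labels via the tab file.
def _demo_load_cluster_data(gr='all_by_all.mcl', tab='all_by_all.tab', mID=None):
    clusters, labels, mID_by_label = load_cluster_data(gr, tab, mID)
    for cid, members in sorted(clusters.items(), key=lambda t: -len(t[1]))[:10]:
        print >> sys.stderr, ('cluster %s: %s members, e.g. %s'
                              % (cid, len(members), labels.get(members[0])))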
def load_lines_from_uniqued(source_uniques, rv_sort=True,
                            sort_key=lambda x: (len(x[0]), int(x[1])),
                            keep_source_id=False):
    '''Load, normalise, and sort parsed lines from .uniqued files.

    If keep_source_id is True, returns a list of (parsed_line, uniqued_id)
    2-tuples, where uniqued_id is the file basename without extension
    (e.g. 100617_lane6_PE for "data/100617/100617_lane6_PE.uniqued");
    otherwise returns a list of parsed lines.
    '''
    uniquedlines = []
    for f in source_uniques:
        print >> sys.stderr, 'load %s ...' % f,
        lines = [l.strip().split() for l in open(f).readlines()]
        print >> sys.stderr, '%s lines' % len(lines)
        # determine the quality-score encoding base from the first
        # informative quality string (field 2 of each parsed line)
        baseQ = None
        for l in lines:
            baseQ = get_baseQ(l[2])
            if baseQ is not None:
                break
        print >> sys.stderr, 'qual base: %s' % baseQ
        if baseQ == 64:
            print >> sys.stderr, 'Translate quality encoding to base 33 ...',
            for l in lines:
                l[2] = ''.join([chr(ord(c) - 64 + 33) for c in l[2]])
            print >> sys.stderr, 'done'
        if keep_source_id:
            uniqued_id = os.path.basename(os.path.splitext(f)[0])
            uniquedlines.extend(zip(lines, [uniqued_id] * len(lines)))
        else:
            uniquedlines.extend(lines)
    print >> sys.stderr, 'sort',
    if keep_source_id:
        uniquedlines.sort(reverse=rv_sort, key=lambda x: sort_key(x[0]))
    else:
        uniquedlines.sort(reverse=rv_sort, key=sort_key)
    print >> sys.stderr, 'done'
    return uniquedlines
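# Usage sketch (illustrative; the .uniqued path is the placeholder from the
# docstring above): merge uniqued files while keeping track of which file each
# line came from, sorted descending by the default key. Judging by that key,
# field 0 appears to be the read sequence and field 1 an integer count, but
# that interpretation is an assumption, not confirmed by this module.
def _demo_load_uniqued(files=('data/100617/100617_lane6_PE.uniqued',)):
    merged = load_lines_from_uniqued(list(files), keep_source_id=True)
    for parsed, source in merged[:5]:
        print >> sys.stderr, '%s\t%s\t%s' % (source, parsed[1], parsed[0][:40])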