def _open_resource(url_file_stream_or_string):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    Resolution order:
      1. anything with a ``read`` attribute is returned unchanged;
      2. the string ``'-'`` returns ``sys.stdin``;
      3. an http/https/ftp URL is opened through urllib2;
      4. otherwise the value is tried as a local filename;
      5. finally the value itself is wrapped in an in-memory stream.
    """
    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header('A-IM', 'feed')  # RFC 3229 support
        opener = urllib2.build_opener()
        opener.addheaders = []  # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close()  # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string)
    except Exception:
        # Best effort: not an openable path, so fall through and treat the
        # value as literal data.  KeyboardInterrupt/SystemExit still propagate.
        pass

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
def build_pdf(self):
    """Render ``self.elements`` into an in-memory document and return its content.

    A document template is created over an in-memory buffer using the keyword
    arguments in ``self.config``.  If the last element is a ``_Spacer`` it is
    removed from ``self.elements`` (the list is mutated) before building.

    Returns:
        The value of the in-memory buffer after the build.
    """
    buf = _StringIO()
    sdoc = _SimpleDocTemplate(filename=buf, **self.config)
    # Guard against an empty element list: indexing [-1] would raise IndexError.
    if self.elements and isinstance(self.elements[-1], _Spacer):
        del self.elements[-1]
    sdoc.build(self.elements,
               onFirstPage=self.__FirtPage,
               onLaterPages=self.__LaterPages)
    return buf.getvalue()
def pformat(self, object):
    """Return PrettyPrinter's formatting of *object*, indented line by line.

    Every non-empty line of the base output is prefixed with the per-line
    indent string; each line (empty or not) is terminated by a newline.
    """
    base_text = PrettyPrinter.pformat(self, object)
    out = _StringIO()
    emit = out.write
    for text in base_text.split('\n'):
        if text:
            # Only non-empty lines receive the indent prefix.
            emit(self.__per_line_indent_str)
        emit(text)
        emit('\n')
    return out.getvalue()
def __init__(self, max_size = 0, mode = 'w+b', bufsize = -1, suffix = '', prefix = template, dir = None):
    """Start with an in-memory buffer and record the file-creation arguments.

    ``mode``, ``bufsize``, ``suffix``, ``prefix`` and ``dir`` are not used
    here; they are stored, in that order, in ``_TemporaryFileArgs``.
    """
    # Stash the arguments untouched for whoever creates the file later.
    self._TemporaryFileArgs = (mode, bufsize, suffix, prefix, dir)
    self._max_size = max_size
    self._rolled = False
    self._file = _StringIO()
def __str__(self):
    """Return the output of ``self.dump`` flattened onto a single line."""
    buf = _StringIO()
    self.dump(buf)
    # Turn the dump into one line: strip each line and join with spaces
    # (cheesy, but it works out well enough for a string form).
    return ' '.join([piece.strip() for piece in buf.getvalue().split('\n')])
def on_finished(self):
    """Collect the result of a finished transfer.

    Records the effective URL on ``self.last_url``.  When no output file was
    requested (``self.options.write_file is None``) the buffered body is put
    into the callback info and, if the response is marked as gzip encoded,
    decompressed in place.  A body that is not actually gzip data is kept
    as received and a warning is logged.
    """
    info = self._make_callback_info()
    self.last_url = self.handle.getinfo(pycurl.EFFECTIVE_URL)
    if self.options.write_file is None:
        info['body'] = self.buffer.getvalue()
        # `gzip` may be falsy when the module is unavailable.
        if gzip and info.get('content-encoding', '') == 'gzip':
            try:
                info['body'] = gzip.GzipFile(
                    fileobj=_StringIO(info['body'])).read()
            except IOError:
                logging.warning("Received header with content-encoding "
                                "gzip, but content is not gzip encoded")
def _updaterating ():
    """
    Update ratings.

    The text written by the updater is stored in the global ``updatelog``;
    when the update fails with a StandardError the formatted traceback is
    appended to it.
    """
    global updatelog
    capture = _StringIO()
    try:
        _update_ratings(config, log=capture, dryrun=False)
    except StandardError:
        updatelog = capture.getvalue() + _traceback.format_exc()
    else:
        updatelog = capture.getvalue()
        # XXX
def __read_data__(self):
    """Reads the XML document.

    Does nothing when data has already been loaded.  Otherwise the resource
    returned by ``__open_resource__`` is read completely, closed, and wrapped
    in an ``InputSource`` whose byte stream is an in-memory copy of the data.
    If no resource could be opened, the data is left unset.
    """
    if self.__data:
        return
    fd = self.__open_resource__()
    if not fd:
        return
    try:
        data = fd.read()
    finally:
        # Release the handle even when reading fails.
        fd.close()
    source = InputSource()
    source.setByteStream(_StringIO(data))
    # Publish only a fully initialised source.
    self.__data = source
def parse(xml):
    """Returns a hierarchy of Element objects parsed from the supplied XML source.

    The XML source can be a full document or any well-formed XML fragment.
    In addition, the `xml` parameter can be a file path, a string, a
    file-like object or a URL.

    A value without a ``read`` attribute is tried as a file path, then as a
    URL, and finally treated as literal XML text.  A stream opened here is
    closed again after parsing; a file-like object passed in by the caller
    is left open.
    """
    from xml.sax.handler import feature_namespaces, feature_namespace_prefixes
    from xml.sax import make_parser, SAXNotSupportedException, SAXNotRecognizedException
    import urllib

    builder = __XMLBuilder()
    parser = make_parser()
    # Perform namespace processing
    parser.setFeature(feature_namespaces, True)
    # Report original prefixed names (i.e. qname is passed to startElementNS);
    # if not possible, we have fallback code anyway
    try:
        parser.setFeature(feature_namespace_prefixes, True)
    except (SAXNotSupportedException, SAXNotRecognizedException):
        pass
    parser.setContentHandler(builder)
    parser.setErrorHandler(builder)

    opened_here = not hasattr(xml, 'read')
    if opened_here:
        try:
            xml = open(xml)
        except Exception:
            # Not a readable file path: try it as a URL next.
            try:
                xml = urllib.urlopen(xml)
            except Exception:
                # Neither a file nor a URL: treat the value as XML text.
                xml = _StringIO(xml)
    try:
        parser.parse(xml)
    finally:
        if opened_here:
            xml.close()
    return builder.tree
def _updatezapper ():
    """
    Update filter rules (.zap files).

    The text written by the updater is stored in the global ``updatelog``.
    After a successful update the filter configuration is written; on a
    StandardError the formatted traceback is appended to the log text.
    """
    global updatelog
    capture = _StringIO()
    try:
        needs_reload = _update_filter(config, log=capture, dryrun=False)
        updatelog = capture.getvalue()
        config.write_filterconf()
    except StandardError:
        updatelog = capture.getvalue() + _traceback.format_exc()
    else:
        if needs_reload:
            # Nothing is done with the reload request at present.
            pass
def __repr__(self, asdoc=False, encoding='UTF8'):
    """Returns an XML representation of this Element.

    By default the XML returned is not a valid document. If the `asdoc`
    parameter supplied is true then the representation is prefaced with an
    XML version processing instruction so that a valid document is returned.
    If the `encoding` parameter is supplied, it will be used as the encoding
    for the XML document returned.
    """
    buf = _StringIO()
    self.write(buf, encoding)
    body = buf.getvalue().strip()
    if not asdoc:
        return '%s' % body
    # Two-step formatting: first the encoding, then the serialized element.
    template = '<?xml version="1.0" encoding="%s"?>%%s' % encoding
    return template % body
def __open_resource__(self):
    """Opens the resource depending on the type of information given.

    If it is a file handle, nothing needs to be done; if it is the XML data,
    make it readable like a file; if it is a filename, open it and return
    the file handle.

    Return: A handle to read from by calling the method 'read()' of the
    handle, or None when the value cannot be opened as a file.
    """
    source = self.url_file_string
    if hasattr(source, 'read'):
        return source
    if source == "-":
        return sys.stdin
    # Slice instead of indexing so that an empty string does not raise.
    if source[:1] == "<":
        return _StringIO(source.encode("utf-8"))
    try:
        return open(source)
    except Exception:
        # Best effort: report "nothing to read" to the caller.
        return None
def convert(path):
    """
    Run converter

    Extract the text of the PDF at *path* and return it.

    If extraction is not allowed for the file, a decrypted copy named
    ``<first part of the file name>_decrypted.pdf`` in the same directory is
    used instead; it is created with the external ``qpdf`` tool when it does
    not exist yet.
    """
    # Local import: the argument-list form avoids shell quoting problems.
    import subprocess as _subprocess

    rsrcmgr = _PDFResourceManager()
    retstr = _StringIO()
    device = _TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=_LAParams())
    stream = open(path, "rb")
    interpreter = _PDFPageInterpreter(rsrcmgr, device)
    try:
        try:
            for page in _PDFPage.get_pages(stream, set(), maxpages=0, password="",
                                           caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Evaluated before the buffers are closed by the finally clause.
            return retstr.getvalue()
        finally:
            # Release all handles on success and on any failure.
            stream.close()
            device.close()
            retstr.close()
    except _PDFTextExtractionNotAllowed:
        directory, filename = _os.path.split(path)
        # os.path.join keeps a bare file name relative instead of rooting it.
        decr_file = _os.path.join(
            directory, filename.split(".")[0] + "_decrypted.pdf")
        if not _os.path.exists(decr_file):
            # No shell involved, so spaces or metacharacters in paths are safe.
            _subprocess.call(["qpdf", "--decrypt", path, decr_file])
        return convert(decr_file)
def __init__(self, data=None, mode=None):
    """Wrap a fresh in-memory buffer, optionally pre-filled with *data*.

    When *data* is given it is written to the buffer and the position is
    moved back to the start.  The buffer and *mode* are then handed to the
    parent initialiser.
    """
    backing = _StringIO()
    has_initial_data = data is not None
    if has_initial_data:
        backing.write(data)
        backing.seek(0)
    super(StringIO, self).__init__(backing, mode)
def _safe_repr(object, context, maxlevels, level):
    """Return ``(repr_string, readable, recursive)`` for *object*.

    Exact ``str``, ``dict``, ``list`` and ``tuple`` instances are formatted
    here; everything else falls back to ``repr``.  *context* holds the ids
    of the containers currently being formatted (used to detect recursion);
    *maxlevels*, when non-zero, limits the nesting depth that is expanded.
    """
    kind = _type(object)

    if kind is str:
        if 'locale' not in _sys.modules:
            return repr(object), True, False
        # Choose the quote character that needs the least escaping.
        if "'" in object and '"' not in object:
            closure = '"'
            escapes = {'"': '\\"'}
        else:
            closure = "'"
            escapes = {"'": "\\'"}
        lookup = escapes.get
        buf = _StringIO()
        emit = buf.write
        for ch in object:
            # Alphabetic characters are copied; others use their repr escape.
            emit(ch if ch.isalpha() else lookup(ch, repr(ch)[1:-1]))
        return ("%s%s%s" % (closure, buf.getvalue(), closure)), True, False

    if kind is dict:
        if not object:
            return "{}", True, False
        ident = _id(object)
        if maxlevels and level > maxlevels:
            return "{...}", False, ident in context
        if ident in context:
            return _recursion(object), False, True
        context[ident] = 1
        readable = True
        recursive = False
        pieces = []
        level += 1
        for key, value in object.iteritems():
            key_text, key_ok, key_rec = _safe_repr(key, context, maxlevels, level)
            val_text, val_ok, val_rec = _safe_repr(value, context, maxlevels, level)
            pieces.append("%s: %s" % (key_text, val_text))
            readable = readable and key_ok and val_ok
            if key_rec or val_rec:
                recursive = True
        del context[ident]
        return "{%s}" % _commajoin(pieces), readable, recursive

    if kind is list or kind is tuple:
        if kind is list:
            if not object:
                return "[]", True, False
            format = "[%s]"
        elif _len(object) == 1:
            format = "(%s,)"
        else:
            if not object:
                return "()", True, False
            format = "(%s)"
        ident = _id(object)
        if maxlevels and level > maxlevels:
            return format % "...", False, ident in context
        if ident in context:
            return _recursion(object), False, True
        context[ident] = 1
        readable = True
        recursive = False
        pieces = []
        level += 1
        for item in object:
            item_text, item_ok, item_rec = _safe_repr(item, context, maxlevels, level)
            pieces.append(item_text)
            if not item_ok:
                readable = False
            if item_rec:
                recursive = True
        del context[ident]
        return format % _commajoin(pieces), readable, recursive

    text = repr(object)
    return text, (text and not text.startswith('<')), False
def _safe_repr(object, context, maxlevels, level):
    """Return ``(repr_string, readable, recursive)`` for *object*.

    Strings, and dict/list/tuple instances whose type still uses the builtin
    ``__repr__``, are formatted here; everything else falls back to ``repr``.
    *context* holds the ids of containers currently being formatted, which is
    how recursion is detected.  When *maxlevels* is non-zero, containers at or
    beyond that nesting *level* are abbreviated with ``...``.
    """
    typ = _type(object)
    if typ is str:
        if 'locale' not in _sys.modules:
            return (repr(object), True, False)
        # Pick the quote character that needs the least escaping.
        if "'" in object and '"' not in object:
            closure = '"'
            quotes = {'"': '\\"'}
        else:
            closure = "'"
            quotes = {"'": "\\'"}
        qget = quotes.get
        sio = _StringIO()
        write = sio.write
        for char in object:
            # Alphabetic characters are copied as is; anything else is
            # written as the chosen quote escape or its repr() escape.
            if char.isalpha():
                write(char)
            else:
                write(qget(char, repr(char)[1:-1]))
        return ('%s%s%s' % (closure, sio.getvalue(), closure), True, False)
    else:
        r = getattr(typ, '__repr__', None)
        # Only handle dicts whose type has not overridden __repr__.
        if issubclass(typ, dict) and r is dict.__repr__:
            if not object:
                return ('{}', True, False)
            objid = _id(object)
            if maxlevels and level >= maxlevels:
                return ('{...}', False, objid in context)
            if objid in context:
                return (_recursion(object), False, True)
            # Mark this container as "in progress" while its items are formatted.
            context[objid] = 1
            readable = True
            recursive = False
            components = []
            append = components.append
            level += 1
            saferepr = _safe_repr
            for k, v in _sorted(object.items()):
                krepr, kreadable, krecur = saferepr(k, context, maxlevels, level)
                vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
                append('%s: %s' % (krepr, vrepr))
                readable = readable and kreadable and vreadable
                if krecur or vrecur:
                    recursive = True
            del context[objid]
            return ('{%s}' % _commajoin(components), readable, recursive)
        # Likewise for lists and tuples with the builtin __repr__.
        if issubclass(typ, list) and r is list.__repr__ or issubclass(typ, tuple) and r is tuple.__repr__:
            if issubclass(typ, list):
                if not object:
                    return ('[]', True, False)
                format = '[%s]'
            elif _len(object) == 1:
                # One-element tuples need the trailing comma.
                format = '(%s,)'
            else:
                if not object:
                    return ('()', True, False)
                format = '(%s)'
            objid = _id(object)
            if maxlevels and level >= maxlevels:
                return (format % '...', False, objid in context)
            if objid in context:
                return (_recursion(object), False, True)
            context[objid] = 1
            readable = True
            recursive = False
            components = []
            append = components.append
            level += 1
            for o in object:
                orepr, oreadable, orecur = _safe_repr(o, context, maxlevels, level)
                append(orepr)
                if not oreadable:
                    readable = False
                if orecur:
                    recursive = True
            del context[objid]
            return (format % _commajoin(components), readable, recursive)
    # Fallback: a repr starting with '<' is reported as not readable.
    rep = repr(object)
    return (rep, rep and not rep.startswith('<'), False)
"""Implements (a subset of) Sun XDR -- eXternal Data Representation.
def subsample(self, genome_size=6601757, read_cov_depth=80, pc_loss=0.2, force=False, cov_closeness=5):
    '''
    Given the size in basepairs of a genome sequence, downsample fastq files to
    a desired average read coverage depth predicted after read alignment. Read
    lengths are taken from the file. By default, 20% are assumed to be lost at
    downstream quality control stages (e.g. quality score based trimming). The
    percent loss is used in coverage depth estimation. cov_closeness, which
    defaults to 5, will prevent subsampling if within 5x coverage: avoids time
    consuming subsampling that will only make a small difference.

    For every pair in self.read_files (entries 1 and 2 of each value are the
    two fastq paths) the subsampled output paths are the input paths with a
    '_subsmp' suffix inserted. Existing outputs are reused unless force is
    true. Output files are written gzip-compressed.

    On completion self.fullsized_read_files is set from the previous
    self.read_files and self.read_files is replaced by a mapping of
    pair name -> {1: path, 2: path} pointing at the files to use next
    (the originals when no sampling was performed).
    '''
    subsampled_read_files = {}
    start_time = _time.time()
    for cnum, (pairname, files) in enumerate(self.read_files.items()):
        processed_path_1 = insert_suffix(files[1], '_subsmp')
        processed_path_2 = insert_suffix(files[2], '_subsmp')
        if not all([_os.path.exists(processed_path_1), _os.path.exists(processed_path_2)]) \
                or force:
            if files[1][-2:] == 'gz':
                fh1 = _gzip.open(files[1])
            else:
                fh1 = open(files[1])
            # The length of the first read is used for all reads in the file.
            aread = _SeqIO.parse(fh1, 'fastq').next()
            read_len = len(aread.seq)
            print('Counting reads in %s' % files[1])
            fh1.seek(0)
            lines = 0
            # report per half million reads (four lines per read)
            interval = 2000000
            nextreport = interval
            for line in fh1:
                lines += 1
                if lines == nextreport:
                    print('{:,} reads'.format(lines / 4))
                    nextreport += interval
            totalreads = lines / 4.0
            print('Found %s reads' % totalreads)
            # The factor 2 accounts for the two files of the pair.
            full_depth_coverage = read_len * 2 * totalreads * (
                1 - pc_loss) / genome_size
            print(
                'These paired read files would provide approximately {:.1f}x coverage depth'
                .format(full_depth_coverage))
            # NOTE(review): with integer arguments and Python 2 division the
            # first "/" below truncates -- confirm this is acceptable.
            numreads2keep = int(
                round(
                    genome_size * read_cov_depth / (read_len * 2) /
                    (1 - pc_loss), 0))
            if numreads2keep >= totalreads:
                print(
                    'This pair of read files is estimated to provide only {:.1f}x coverage, but {}x requested.'
                    .format(full_depth_coverage, read_cov_depth))
                print('No sampling performed. Original files will be used')
                # pass original files over with subsampled
                subsampled_read_files[pairname] = {}
                subsampled_read_files[pairname][1] = files[1]
                subsampled_read_files[pairname][2] = files[2]
                fh1.close()
                if len(self.read_files) > 1:
                    # report durations, time left etc
                    _report_time(start_time, cnum, len(self.read_files))
                continue
            elif full_depth_coverage < read_cov_depth + cov_closeness:
                print(
                    'This pair of read files is estimated to provide {:.1f}x coverage which is within {}x of {}x requested.'
                    .format(full_depth_coverage, cov_closeness,
                            read_cov_depth))
                print('No sampling performed. Original files will be used')
                # pass original files over with subsampled
                subsampled_read_files[pairname] = {}
                subsampled_read_files[pairname][1] = files[1]
                subsampled_read_files[pairname][2] = files[2]
                fh1.close()
                if len(self.read_files) > 1:
                    # report durations, time left etc
                    _report_time(start_time, cnum, len(self.read_files))
                continue
            else:
                print(
                    'For approximately {}x read coverage, will retain {} of {} {}bp read pairs'
                    .format(read_cov_depth, numreads2keep, totalreads,
                            read_len))
            fh1.seek(0)
            if files[2][-2:] == 'gz':
                fh2 = _gzip.open(files[2])
            else:
                fh2 = open(files[2])
            fout1 = _gzip.open(processed_path_1, 'wb')
            fout2 = _gzip.open(processed_path_2, 'wb')
            # Reads are processed in batches; the same proportion is kept
            # from every batch.
            batch_size = 200000
            keep_per_pop = int(
                numreads2keep / float(totalreads) * batch_size) + 1
            nextwrite = batch_size
            written = 0
            n1 = 0
            n2 = 0
            these_lines1 = []
            these_lines2 = []
            reportfreq = 10
            thisreport = 0
            print('Subsampling . . .')
            for line in fh1:
                these_lines1 += [line]
                if len(these_lines1) % 4 == 0:
                    # a complete four-line read has been buffered
                    n1 += 1
                    if n1 == nextwrite:
                        # A full batch from file 1: choose which reads to keep.
                        keep_indices = sorted(
                            _sample(xrange(batch_size), keep_per_pop))
                        keep_these = []
                        for i in keep_indices:
                            i1 = i * 4
                            i2 = i * 4 + 4
                            keep_these += these_lines1[i1:i2]
                        # try parsing a read for QC
                        assert _SeqIO.read(
                            _StringIO(''.join(keep_these[:4])), 'fastq')
                        fout1.write(''.join(keep_these))
                        these_lines1 = []
                        written += keep_per_pop
                        thisreport += 1
                        if thisreport == reportfreq or written == keep_per_pop:
                            # report first time and at intervals
                            print(
                                'Written {:,} reads ({:.1%}) to {}'.format(
                                    written, written / float(numreads2keep),
                                    processed_path_1))
                        # Advance file 2 by the same batch and keep the same
                        # read indices so that the pairs stay matched.
                        for line2 in fh2:
                            these_lines2 += [line2]
                            if len(these_lines2) % 4 == 0:
                                n2 += 1
                                if n2 == nextwrite:
                                    keep_these = []
                                    for i in keep_indices:
                                        i1 = i * 4
                                        i2 = i * 4 + 4
                                        keep_these += these_lines2[i1:i2]
                                    assert _SeqIO.read(
                                        _StringIO(''.join(keep_these[:4])),
                                        'fastq')
                                    fout2.write(''.join(keep_these))
                                    these_lines2 = []
                                    if thisreport == reportfreq or written == keep_per_pop:
                                        thisreport = 0
                                        print(
                                            'Written {:,} reads ({:.1%}) to {}'
                                            .format(
                                                written,
                                                written / float(numreads2keep),
                                                processed_path_2))
                                    nextwrite += batch_size
                                    break
            # write remainder
            # NOTE(review): nextwrite - n1 is the number of reads missing from
            # a full batch, not the number of reads still buffered in
            # these_lines1; indices past the buffered reads give empty slices.
            # Confirm whether this is intended (see the note further down).
            remainder = nextwrite - n1
            keep_in_remainder = int(
                keep_per_pop * (remainder / float(batch_size))) + 1
            keep_indices = sorted(
                _sample(xrange(remainder), keep_in_remainder))
            keep_these = []
            for i in keep_indices:
                i1 = i * 4
                i2 = i * 4 + 4
                keep_these += these_lines1[i1:i2]
            # try parsing a read for QC
            assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
            fout1.write(''.join(keep_these))
            written += keep_in_remainder
            print('Written {:,} reads ({:.1%}) to {}'.format(
                written, written / float(numreads2keep), processed_path_1))
            # get remainder
            for line2 in fh2:
                these_lines2 += [line2]
            # write remainder
            keep_these = []
            for i in keep_indices:
                i1 = i * 4
                i2 = i * 4 + 4
                keep_these += these_lines2[i1:i2]
            assert _SeqIO.read(
                _StringIO(''.join(keep_these[:4])), 'fastq')  # TODO: check why keep_these was empty
            fout2.write(''.join(keep_these))
            print('Written {:,} reads ({:.1%}) to {}'.format(
                written, written / float(numreads2keep), processed_path_2))
            # An alternative that joined each read into one string before
            # batching was tried here; it was unclear whether it is quicker
            # or slower (more calls to .join()), so it is not used.
            fout1.close()
            fout2.close()
            fh1.close()
            fh2.close()
        else:
            print('Found:')
            print(processed_path_1)
            print(processed_path_2)
            print('use "force = True" to overwrite')
        if len(self.read_files) > 1:
            # report durations, time left etc
            _report_time(start_time, cnum, len(self.read_files))
        subsampled_read_files[pairname] = {}
        subsampled_read_files[pairname][1] = processed_path_1
        subsampled_read_files[pairname][2] = processed_path_2
    # replace here as this step is optional
    # NOTE(review): list() of a dict keeps only its keys -- confirm that the
    # original paths are not expected in fullsized_read_files.
    self.fullsized_read_files = list(self.read_files)
    self.read_files = subsampled_read_files
def __init__(self,data=None,mode=None):
    """Wrap a fresh in-memory buffer, optionally pre-filled with *data*.

    When *data* is given it is written to the buffer and the position is
    moved back to the start.  The buffer and *mode* are then handed to the
    parent initialiser.
    """
    backing = _StringIO()
    has_initial_data = data is not None
    if has_initial_data:
        backing.write(data)
        backing.seek(0)
    super(StringIO, self).__init__(backing, mode)
def pformat(self, object, enable_pickle=None):
    """Return the formatted representation of *object* as a string.

    *enable_pickle* is passed on to ``self._format``; when it is None the
    instance's ``enable_pickle`` attribute is used instead.
    """
    use_pickle = self.enable_pickle if enable_pickle is None else enable_pickle
    out = _StringIO()
    self._format(object, out, 0, 0, {}, 0, use_pickle)
    return out.getvalue()
def _safe_repr(object, context, maxlevels, level):
    """Return ``(repr_string, readable, recursive)`` for *object*.

    Strings, and dict/list/tuple instances whose type still uses the builtin
    ``__repr__``, are formatted here; everything else falls back to ``repr``.
    *context* holds the ids of containers currently being formatted, which is
    how recursion is detected.  When *maxlevels* is non-zero, containers
    nested deeper than that are abbreviated with ``...``.
    """
    typ = _type(object)
    if typ is str:
        if 'locale' not in _sys.modules:
            return (repr(object), True, False)
        # Pick the quote character that needs the least escaping.
        if "'" in object and '"' not in object:
            closure = '"'
            quotes = {'"': '\\"'}
        else:
            closure = "'"
            quotes = {"'": "\\'"}
        qget = quotes.get
        sio = _StringIO()
        write = sio.write
        for char in object:
            if char.isalpha():
                write(char)
            else:
                write(qget(char, repr(char)[1:-1]))
        return ('%s%s%s' % (closure, sio.getvalue(), closure), True, False)

    r = getattr(typ, '__repr__', None)
    if issubclass(typ, dict) and r is dict.__repr__:
        if not object:
            return ('{}', True, False)
        objid = _id(object)
        if maxlevels and level > maxlevels:
            return ('{...}', False, objid in context)
        if objid in context:
            return (_recursion(object), False, True)
        # Mark this container as "in progress" while its items are formatted.
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        saferepr = _safe_repr
        for (k, v) in object.iteritems():
            (krepr, kreadable, krecur) = saferepr(k, context, maxlevels, level)
            (vrepr, vreadable, vrecur) = saferepr(v, context, maxlevels, level)
            append('%s: %s' % (krepr, vrepr))
            # The dict is readable only if every key and value is.
            readable = readable and kreadable and vreadable
            if krecur or vrecur:
                recursive = True
        del context[objid]
        return ('{%s}' % _commajoin(components), readable, recursive)

    if (issubclass(typ, list) and r is list.__repr__) or \
            (issubclass(typ, tuple) and r is tuple.__repr__):
        if issubclass(typ, list):
            if not object:
                return ('[]', True, False)
            format = '[%s]'
        elif _len(object) == 1:
            # One-element tuples need the trailing comma.
            format = '(%s,)'
        else:
            if not object:
                return ('()', True, False)
            format = '(%s)'
        objid = _id(object)
        if maxlevels and level > maxlevels:
            return (format % '...', False, objid in context)
        if objid in context:
            return (_recursion(object), False, True)
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        for o in object:
            (orepr, oreadable, orecur) = _safe_repr(o, context, maxlevels, level)
            append(orepr)
            if not oreadable:
                readable = False
            if orecur:
                recursive = True
        del context[objid]
        return (format % _commajoin(components), readable, recursive)

    # Fallback: an empty repr or one starting with '<' is not readable.
    rep = repr(object)
    return (rep, (rep and not rep.startswith('<')), False)
# Factory returning a fresh, empty _StringIO; the `newline` argument is
# accepted but not used.
# NOTE(review): the trailing `#3--` tags look like markers consumed by a
# source-conversion step -- confirm before editing or removing them.
def StringIO(newline): #3--
    return _StringIO() #3--
def pformat(self, object):
    """Return the formatted representation of *object* as a string."""
    out = _StringIO()
    format_args = (object, out, 0, 0, {}, 0)
    self._format(*format_args)
    return out.getvalue()
def __init__(self, basename):
    """Create an in-memory file for *basename* and register its handlers.

    The instance's ``_do_open`` and ``_do_read`` methods are registered under
    the base name via ``pyfs_add``.
    """
    self._file = _StringIO()
    self.basename = basename
    open_handler = self._do_open
    read_handler = self._do_read
    pyfs_add(self.basename, open_handler, read_handler)
def _safe_repr(object, context, maxlevels, level, enable_pickle=None):
    """Return ``(repr_string, readable, recursive)`` for *object*.

    Strings, and dict/list/tuple instances whose type still uses the builtin
    ``__repr__``, are formatted here.  When *enable_pickle* is true, objects
    accepted by ``_pickleable`` are shown as ``Constructor(arg, ...)`` built
    from their ``__reduce__`` data.  Everything else falls back to ``repr``.

    *context* holds the ids of containers currently being formatted (used to
    detect recursion).  When *maxlevels* is non-zero, containers at or beyond
    that nesting *level* are abbreviated with ``...``.  *enable_pickle*
    defaults to ``ENABLE_PICKLE_DEFAULT`` and is passed on unchanged to every
    nested call.
    """
    if enable_pickle is None:
        enable_pickle = ENABLE_PICKLE_DEFAULT
    typ = _type(object)
    if typ is str:
        if 'locale' not in _sys.modules:
            return repr(object), True, False
        # Pick the quote character that needs the least escaping.
        if "'" in object and '"' not in object:
            closure = '"'
            quotes = {'"': '\\"'}
        else:
            closure = "'"
            quotes = {"'": "\\'"}
        qget = quotes.get
        sio = _StringIO()
        write = sio.write
        for char in object:
            if char.isalpha():
                write(char)
            else:
                write(qget(char, repr(char)[1:-1]))
        return ("%s%s%s" % (closure, sio.getvalue(), closure)), True, False

    r = getattr(typ, "__repr__", None)
    if issubclass(typ, dict) and r is dict.__repr__:
        if not object:
            return "{}", True, False
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return "{...}", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        # Mark this container as "in progress" while its items are formatted.
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        saferepr = _safe_repr
        for k, v in _sorted(object.items()):
            # Pass the pickle setting down so nested values follow it too.
            krepr, kreadable, krecur = saferepr(
                k, context, maxlevels, level, enable_pickle)
            vrepr, vreadable, vrecur = saferepr(
                v, context, maxlevels, level, enable_pickle)
            append("%s: %s" % (krepr, vrepr))
            readable = readable and kreadable and vreadable
            if krecur or vrecur:
                recursive = True
        del context[objid]
        return "{%s}" % _commajoin(components), readable, recursive

    if (issubclass(typ, list) and r is list.__repr__) or \
       (issubclass(typ, tuple) and r is tuple.__repr__):
        if issubclass(typ, list):
            if not object:
                return "[]", True, False
            format = "[%s]"
        elif _len(object) == 1:
            # One-element tuples need the trailing comma.
            format = "(%s,)"
        else:
            if not object:
                return "()", True, False
            format = "(%s)"
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return format % "...", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        for o in object:
            orepr, oreadable, orecur = _safe_repr(
                o, context, maxlevels, level, enable_pickle)
            append(orepr)
            if not oreadable:
                readable = False
            if orecur:
                recursive = True
        del context[objid]
        return format % _commajoin(components), readable, recursive

    ## Use pickle data
    if enable_pickle and _pickleable(typ, object):
        reduce_data = object.__reduce__()
        constructor, args = reduce_data
        constructor = constructor.__name__
        if not args:
            return constructor+"()", True, False
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return constructor+"(...)", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        saferepr = _safe_repr
        for arg in args:
            arepr, areadable, arecur = saferepr(
                arg, context, maxlevels, level, enable_pickle)
            append(arepr)
            readable = readable and areadable
            if arecur:
                recursive = True
        del context[objid]
        return constructor + "(%s)" % _commajoin(components), readable, recursive

    # Fallback: a repr starting with '<' is reported as not readable.
    rep = repr(object)
    return rep, (rep and not rep.startswith('<')), False
def reset(self):
    """Replace the internal buffer with a fresh, empty one."""
    fresh_buffer = _StringIO()
    self.__buf = fresh_buffer
def _safe_repr(object, context, maxlevels, level):
    """Return ``(repr_string, readable, recursive)`` for *object*.

    Strings, and dict/list/tuple instances whose type still uses the builtin
    ``__repr__``, are formatted here; everything else falls back to ``repr``.
    *context* holds the ids of containers currently being formatted, which is
    how recursion is detected.  When *maxlevels* is non-zero, containers at or
    beyond that nesting *level* are abbreviated with ``...``.
    """
    typ = _type(object)
    if typ is str:
        if 'locale' not in _sys.modules:
            return repr(object), True, False
        # Pick the quote character that needs the least escaping.
        if "'" in object and '"' not in object:
            closure = '"'
            quotes = {'"': '\\"'}
        else:
            closure = "'"
            quotes = {"'": "\\'"}
        qget = quotes.get
        sio = _StringIO()
        write = sio.write
        for char in object:
            # Alphabetic characters are copied as is; anything else is
            # written as the chosen quote escape or its repr() escape.
            if char.isalpha():
                write(char)
            else:
                write(qget(char, repr(char)[1:-1]))
        return ("%s%s%s" % (closure, sio.getvalue(), closure)), True, False

    r = getattr(typ, "__repr__", None)
    # Only handle dicts whose type has not overridden __repr__.
    if issubclass(typ, dict) and r == dict.__repr__:
        if not object:
            return "{}", True, False
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return "{...}", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        # Mark this container as "in progress" while its items are formatted.
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        saferepr = _safe_repr
        for k, v in _sorted(object.items()):
            krepr, kreadable, krecur = saferepr(k, context, maxlevels, level)
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
            append("%s: %s" % (krepr, vrepr))
            readable = readable and kreadable and vreadable
            if krecur or vrecur:
                recursive = True
        del context[objid]
        return "{%s}" % _commajoin(components), readable, recursive

    # Likewise for lists and tuples with the builtin __repr__.
    if (issubclass(typ, list) and r == list.__repr__) or \
       (issubclass(typ, tuple) and r == tuple.__repr__):
        if issubclass(typ, list):
            if not object:
                return "[]", True, False
            format = "[%s]"
        elif _len(object) == 1:
            # One-element tuples need the trailing comma.
            format = "(%s,)"
        else:
            if not object:
                return "()", True, False
            format = "(%s)"
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return format % "...", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        for o in object:
            orepr, oreadable, orecur = _safe_repr(o, context, maxlevels, level)
            append(orepr)
            if not oreadable:
                readable = False
            if orecur:
                recursive = True
        del context[objid]
        return format % _commajoin(components), readable, recursive

    # Fallback: a repr starting with '<' is reported as not readable.
    rep = repr(object)
    return rep, (rep and not rep.startswith('<')), False
def subsample(self, genome_size = 6601757, read_cov_depth = 80, pc_loss = 0.2, force = False, cov_closeness = 5):
    '''
    Given the size in basepairs of a genome sequence, downsample fastq files to
    a desired average read coverage depth predicted after read alignment. Read
    lengths are taken from the file. By default, 20% are assumed to be lost at
    downstream quality control stages (e.g. quality score based trimming). The
    percent loss is used in coverage depth estimation. cov_closeness, which
    defaults to 5, will prevent subsampling if within 5x coverage: avoids time
    consuming subsampling that will only make a small difference.

    For every pair in self.read_files (entries 1 and 2 of each value are the
    two fastq paths) the subsampled output paths are the input paths with a
    '_subsmp' suffix inserted. Existing outputs are reused unless force is
    true. Output files are written gzip-compressed.

    On completion self.fullsized_read_files is set from the previous
    self.read_files and self.read_files is replaced by a mapping of
    pair name -> {1: path, 2: path} pointing at the files to use next
    (the originals when no sampling was performed).
    '''
    subsampled_read_files = {}
    start_time = _time.time()
    for cnum,(pairname,files) in enumerate(self.read_files.items()):
        processed_path_1 = insert_suffix(files[1], '_subsmp')
        processed_path_2 = insert_suffix(files[2], '_subsmp')
        if not all([_os.path.exists(processed_path_1), _os.path.exists(processed_path_2)]) \
                or force:
            if files[1][-2:] == 'gz':
                fh1 = _gzip.open(files[1])
            else:
                fh1 = open(files[1])
            # The length of the first read is used for all reads in the file.
            aread = _SeqIO.parse(fh1, 'fastq').next()
            read_len = len(aread.seq)
            print('Counting reads in %s' % files[1])
            fh1.seek(0)
            lines = 0
            # report per half million reads (four lines per read)
            interval = 2000000
            nextreport = interval
            for line in fh1:
                lines += 1
                if lines == nextreport:
                    print('{:,} reads'.format(lines/4))
                    nextreport += interval
            totalreads = lines / 4.0
            print('Found %s reads' % totalreads)
            # The factor 2 accounts for the two files of the pair.
            full_depth_coverage = read_len * 2 * totalreads * (1 - pc_loss) / genome_size
            print('These paired read files would provide approximately {:.1f}x coverage depth'.format(full_depth_coverage))
            # NOTE(review): with integer arguments and Python 2 division the
            # first "/" below truncates -- confirm this is acceptable.
            numreads2keep = int( round(genome_size * read_cov_depth / (read_len * 2) / (1 - pc_loss), 0) )
            if numreads2keep >= totalreads:
                print('This pair of read files is estimated to provide only {:.1f}x coverage, but {}x requested.'.format(full_depth_coverage, read_cov_depth))
                print('No sampling performed. Original files will be used')
                # pass original files over with subsampled
                subsampled_read_files[pairname] = {}
                subsampled_read_files[pairname][1] = files[1]
                subsampled_read_files[pairname][2] = files[2]
                fh1.close()
                if len(self.read_files) > 1:
                    # report durations, time left etc
                    _report_time(start_time, cnum, len(self.read_files))
                continue
            elif full_depth_coverage < read_cov_depth + cov_closeness:
                print('This pair of read files is estimated to provide {:.1f}x coverage which is within {}x of {}x requested.'.format(full_depth_coverage, cov_closeness, read_cov_depth))
                print('No sampling performed. Original files will be used')
                # pass original files over with subsampled
                subsampled_read_files[pairname] = {}
                subsampled_read_files[pairname][1] = files[1]
                subsampled_read_files[pairname][2] = files[2]
                fh1.close()
                if len(self.read_files) > 1:
                    # report durations, time left etc
                    _report_time(start_time, cnum, len(self.read_files))
                continue
            else:
                print('For approximately {}x read coverage, will retain {} of {} {}bp read pairs'.format(
                        read_cov_depth, numreads2keep, totalreads, read_len))
            fh1.seek(0)
            if files[2][-2:] == 'gz':
                fh2 = _gzip.open(files[2])
            else:
                fh2 = open(files[2])
            fout1 = _gzip.open(processed_path_1, 'wb')
            fout2 = _gzip.open(processed_path_2, 'wb')
            # Reads are processed in batches; the same proportion is kept
            # from every batch.
            batch_size = 200000
            keep_per_pop = int(numreads2keep / float(totalreads) * batch_size) + 1
            nextwrite = batch_size
            written = 0
            n1 = 0
            n2 = 0
            these_lines1 = []
            these_lines2 = []
            reportfreq = 10
            thisreport = 0
            print('Subsampling . . .')
            for line in fh1:
                these_lines1 += [line]
                if len(these_lines1) % 4 == 0:
                    # a complete four-line read has been buffered
                    n1 += 1
                    if n1 == nextwrite:
                        # A full batch from file 1: choose which reads to keep.
                        keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
                        keep_these = []
                        for i in keep_indices:
                            i1 = i * 4
                            i2 = i * 4 + 4
                            keep_these += these_lines1[i1:i2]
                        # try parsing a read for QC
                        assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                        fout1.write(''.join(keep_these))
                        these_lines1 = []
                        written += keep_per_pop
                        thisreport += 1
                        if thisreport == reportfreq or written == keep_per_pop:
                            # report first time and at intervals
                            print('Written {:,} reads ({:.1%}) to {}'.format(written, written/float(numreads2keep), processed_path_1))
                        # Advance file 2 by the same batch and keep the same
                        # read indices so that the pairs stay matched.
                        for line2 in fh2:
                            these_lines2 += [line2]
                            if len(these_lines2) % 4 == 0:
                                n2 += 1
                                if n2 == nextwrite:
                                    keep_these = []
                                    for i in keep_indices:
                                        i1 = i * 4
                                        i2 = i * 4 + 4
                                        keep_these += these_lines2[i1:i2]
                                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                                    fout2.write(''.join(keep_these))
                                    these_lines2 = []
                                    if thisreport == reportfreq or written == keep_per_pop:
                                        thisreport = 0
                                        print('Written {:,} reads ({:.1%}) to {}'.format(written, written/float(numreads2keep), processed_path_2))
                                    nextwrite += batch_size
                                    break
            # write remainder
            # NOTE(review): nextwrite - n1 is the number of reads missing from
            # a full batch, not the number of reads still buffered in
            # these_lines1; indices past the buffered reads give empty slices.
            # Confirm whether this is intended (see the note further down).
            remainder = nextwrite - n1
            keep_in_remainder = int(keep_per_pop * (remainder / float(batch_size))) + 1
            keep_indices = sorted(_sample(xrange(remainder), keep_in_remainder))
            keep_these = []
            for i in keep_indices:
                i1 = i * 4
                i2 = i * 4 + 4
                keep_these += these_lines1[i1:i2]
            # try parsing a read for QC
            assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
            fout1.write(''.join(keep_these))
            written += keep_in_remainder
            print('Written {:,} reads ({:.1%}) to {}'.format(written, written/float(numreads2keep), processed_path_1))
            # get remainder
            for line2 in fh2:
                these_lines2 += [line2]
            # write remainder
            keep_these = []
            for i in keep_indices:
                i1 = i * 4
                i2 = i * 4 + 4
                keep_these += these_lines2[i1:i2]
            assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')  # TODO: check why keep_these was empty
            fout2.write(''.join(keep_these))
            print('Written {:,} reads ({:.1%}) to {}'.format(written, written/float(numreads2keep), processed_path_2))
            # An alternative that joined each read into one string before
            # batching was tried here; it was unclear whether it is quicker
            # or slower (more calls to .join()), so it is not used.
            fout1.close()
            fout2.close()
            fh1.close()
            fh2.close()
        else:
            print('Found:')
            print(processed_path_1)
            print(processed_path_2)
            print('use "force = True" to overwrite')
        if len(self.read_files) > 1:
            # report durations, time left etc
            _report_time(start_time, cnum, len(self.read_files))
        subsampled_read_files[pairname] = {}
        subsampled_read_files[pairname][1] = processed_path_1
        subsampled_read_files[pairname][2] = processed_path_2
    # replace here as this step is optional
    # NOTE(review): list() of a dict keeps only its keys -- confirm that the
    # original paths are not expected in fullsized_read_files.
    self.fullsized_read_files = list(self.read_files)
    self.read_files = subsampled_read_files
def _safe_repr(object, context, maxlevels, level):
    """Build a printable representation of *object* for the pretty printer.

    Unlike plain ``repr``, byte strings that are valid UTF-8 and ``unicode``
    objects are emitted with their text left as-is (only newline, carriage
    return, tab and the enclosing quote character are backslash-escaped), so
    non-ASCII text is not turned into escape sequences.  Relies on Python 2
    ``str``/``unicode`` semantics (``str.decode``, the ``unicode`` builtin).

    Parameters:
        object:    the value to represent.
        context:   dict keyed by ``id()`` of the containers currently being
                   formatted; used to detect self-referencing structures.
        maxlevels: maximum nesting depth to descend into; a falsy value
                   disables the limit.
        level:     current nesting depth.

    Returns:
        A 3-tuple ``(text, readable, recursive)``: the representation, a
        flag that is false when the text was truncated or contains a
        ``<...>``-style repr, and a flag that is true when a recursive
        reference was encountered.
    """
    typ = _type(object)

    if typ is str:
        # Without the locale module loaded, the builtin repr is used as-is.
        if 'locale' not in _sys.modules:
            return repr(object), True, False

        # Prefer double quotes only when that avoids escaping single quotes.
        if "'" in object and '"' not in object:
            closure = '"'
            quotes = {'"': '\\"'}
        else:
            closure = "'"
            quotes = {"'": "\\'"}

        string = object.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
        string = string.replace(closure, '\\' + closure)

        try:
            # Probe only: the converted value is discarded.  It checks that
            # the bytes are valid UTF-8 before they are emitted unescaped.
            string.decode('utf8').encode('gbk', 'replace')
        except (UnicodeError, LookupError):
            # Not UTF-8 text (or the codec is unavailable): fall through to
            # the character-by-character escaping below.
            pass
        else:
            return ("%s%s%s" % (closure, string, closure)), True, False

        qget = quotes.get
        sio = _StringIO()
        write = sio.write
        for char in object:
            if char.isalpha():
                write(char)
            else:
                # repr(char)[1:-1] strips the quotes repr() puts around it.
                write(qget(char, repr(char)[1:-1]))
        return ("%s%s%s" % (closure, sio.getvalue(), closure)), True, False

    if typ is unicode:
        string = object.encode("utf8", 'replace')
        string = string.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
        if "'" in object and '"' not in object:
            closure = '"'
        else:
            closure = "'"
        string = string.replace(closure, '\\' + closure)
        return ("u%s%s%s" % (closure, string, closure)), True, False

    r = getattr(typ, "__repr__", None)

    # Dicts (and subclasses that did not override __repr__).
    if issubclass(typ, dict) and r is dict.__repr__:
        if not object:
            return "{}", True, False
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return "{...}", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        # Mark this container as "in progress" while its items are formatted.
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        saferepr = _safe_repr
        for k, v in _sorted(object.items()):
            krepr, kreadable, krecur = saferepr(k, context, maxlevels, level)
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
            append("%s: %s" % (krepr, vrepr))
            readable = readable and kreadable and vreadable
            if krecur or vrecur:
                recursive = True
        del context[objid]
        return "{%s}" % _commajoin(components), readable, recursive

    # Lists and tuples (and subclasses that did not override __repr__).
    if (issubclass(typ, list) and r is list.__repr__) or \
       (issubclass(typ, tuple) and r is tuple.__repr__):
        if issubclass(typ, list):
            if not object:
                return "[]", True, False
            template = "[%s]"
        elif _len(object) == 1:
            # One-element tuples need the trailing comma.
            template = "(%s,)"
        else:
            if not object:
                return "()", True, False
            template = "(%s)"
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return template % "...", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        for o in object:
            orepr, oreadable, orecur = _safe_repr(o, context, maxlevels, level)
            append(orepr)
            if not oreadable:
                readable = False
            if orecur:
                recursive = True
        del context[objid]
        return template % _commajoin(components), readable, recursive

    # Anything else: builtin repr; "<...>" style reprs count as unreadable.
    rep = repr(object)
    return rep, (rep and not rep.startswith('<')), False