Ejemplo n.º 1
0
def _open_resource(url_file_stream_or_string):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.
    """

    # Already a file-like object: hand it back untouched.
    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    # Conventional "-" means standard input.
    if url_file_stream_or_string == '-':
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header('A-IM', 'feed') # RFC 3229 support
        opener = urllib2.build_opener()
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # Try to open with native open function (if url_file_stream_or_string
    # is a filename).  Only swallow I/O errors here: the previous bare
    # "except:" also hid KeyboardInterrupt/SystemExit and genuine bugs.
    try:
        return open(url_file_stream_or_string)
    except (IOError, OSError):
        pass

    # Not a stream, URL, or readable file: treat the argument as literal data.
    return _StringIO(str(url_file_stream_or_string))
Ejemplo n.º 2
0
	def build_pdf(self):
		"""Render the accumulated flowables to PDF and return the bytes.

		A trailing _Spacer (if any) is dropped so the document does not
		end with blank space.  Checking that the element list is non-empty
		fixes an IndexError when build_pdf() is called with no content.
		"""
		buf = _StringIO()
		sdoc = _SimpleDocTemplate(filename=buf, **self.config)
		# Only inspect the last element when there is one.
		if self.elements and isinstance(self.elements[-1], _Spacer):
			del self.elements[-1]
		sdoc.build(self.elements, onFirstPage=self.__FirtPage, onLaterPages=self.__LaterPages)
		return buf.getvalue()
Ejemplo n.º 3
0
 def pformat(self, object):
     """Pretty-print *object*, indenting every non-empty output line.

     Delegates the actual formatting to PrettyPrinter.pformat and then
     prefixes each non-blank line with the per-line indent string.
     """
     base = PrettyPrinter.pformat(self, object)
     out = _StringIO()
     for text_line in base.split('\n'):
         if text_line:
             out.write(self.__per_line_indent_str)
             out.write(text_line)
         out.write('\n')
     return out.getvalue()
Ejemplo n.º 4
0
 def __init__(self, max_size = 0, mode = 'w+b', bufsize = -1, suffix = '', prefix = template, dir = None):
     """Start buffered in memory and remember the tempfile arguments.

     The in-memory _StringIO backing is swapped for a real temporary
     file once max_size is exceeded; until then _rolled stays False.
     """
     # NOTE(review): the rollover itself is implemented outside this
     # method -- confirm against the enclosing class.
     self._file = _StringIO()
     self._max_size = max_size
     self._rolled = False
     # Saved verbatim for the eventual TemporaryFile construction.
     self._TemporaryFileArgs = (mode, bufsize, suffix, prefix, dir)
Ejemplo n.º 5
0
 def build_pdf(self):
     """Render the accumulated flowables to PDF and return the bytes.

     A trailing _Spacer (if any) is dropped first so the document does
     not end with blank space.  Guarding on a non-empty element list
     fixes an IndexError when no content has been added yet.
     """
     buf = _StringIO()
     sdoc = _SimpleDocTemplate(filename=buf, **self.config)
     # Only inspect the last element when there is one.
     if self.elements and isinstance(self.elements[-1], _Spacer):
         del self.elements[-1]
     sdoc.build(self.elements,
                onFirstPage=self.__FirtPage,
                onLaterPages=self.__LaterPages)
     return buf.getvalue()
Ejemplo n.º 6
0
 def __init__(self, max_size = 0, mode = 'w+b', bufsize = -1, suffix = '', prefix = template, dir = None):
     """Create the spooled file: memory-backed until max_size is hit."""
     # Buffer in memory first; rollover to a real temp file happens
     # elsewhere, at which point _rolled flips to True.
     self._rolled = False
     self._max_size = max_size
     self._file = _StringIO()
     # Kept verbatim for the eventual TemporaryFile construction.
     self._TemporaryFileArgs = (mode, bufsize, suffix, prefix, dir)
Ejemplo n.º 7
0
 def pformat(self, object):
     """Return the pretty-printed *object* with each non-blank line indented."""
     formatted = PrettyPrinter.pformat(self, object)
     indent = self.__per_line_indent_str
     pieces = [(indent + ln) if ln else '' for ln in formatted.split('\n')]
     # The original wrote '\n' after every line, so the result is
     # newline-terminated.
     return '\n'.join(pieces) + '\n'
Ejemplo n.º 8
0
 def __str__(self):
     """Dump this object and collapse the dump onto one line.

     Each line of the dump is stripped and the pieces are joined with
     single spaces (yes, cheesy, but it kind of works out).
     """
     sink = _StringIO()
     self.dump(sink)
     dumped = sink.getvalue()
     return ' '.join(piece.strip() for piece in dumped.split('\n'))
Ejemplo n.º 9
0
 def __str__(self):
     """Return the dump of this object flattened onto a single line."""
     buf = _StringIO()
     self.dump(buf)
     # Strip each dump line and re-join with spaces (cheesy but effective).
     stripped = [seg.strip() for seg in buf.getvalue().split('\n')]
     return ' '.join(stripped)
Ejemplo n.º 10
0
    def on_finished(self):
        """Collect response info once the curl transfer completes.

        Records the final (post-redirect) URL and, when no output file
        was configured, captures the response body -- transparently
        un-gzipping it when the server sent content-encoding: gzip.
        """
        info = self._make_callback_info()
        # EFFECTIVE_URL is the URL after any redirects were followed.
        self.last_url = self.handle.getinfo(pycurl.EFFECTIVE_URL)
        if self.options.write_file is None:
            info['body'] = self.buffer.getvalue()

            # NOTE(review): `gzip` is presumably the gzip module made
            # optional at import time -- confirm; if it failed to import,
            # gzipped bodies are passed through undecoded.
            if gzip and info.get('content-encoding', '') == 'gzip':
                try:
                    info['body'] = gzip.GzipFile(
                                       fileobj=_StringIO(info['body'])).read()
                except IOError, e:
                    # Header lied: body is not actually gzip data; the raw
                    # body is kept as-is.  `e` is unused.
                    logging.warning("Received header with content-encoding "
                                    "gzip, but content is not gzip encoded")
Ejemplo n.º 11
0
def _updaterating():
    """
    Update ratings.
    """
    global updatelog
    buf = _StringIO()
    try:
        doreload = _update_ratings(config, log=buf, dryrun=False)
    except StandardError:
        # On failure, keep whatever was logged and append the traceback.
        updatelog = buf.getvalue()
        updatelog += _traceback.format_exc()
    else:
        updatelog = buf.getvalue()
        # XXX
Ejemplo n.º 12
0
    def __read_data__(self):
        """Reads the XML document into self.__data (an InputSource).

        No-op when the data has already been read.  Previously, when
        __open_resource__() returned a falsy handle, `data` was never
        bound and setByteStream() raised NameError; an empty byte
        stream is used as the fallback instead.
        """

        if self.__data:
            return

        data = ""
        fd = self.__open_resource__()
        if fd:
            data = fd.read()
            fd.close()

        self.__data = InputSource()
        self.__data.setByteStream(_StringIO(data))
Ejemplo n.º 13
0
    def __read_data__(self):
        """Reads the XML document into self.__data (an InputSource).

        No-op when the data has already been read.  Previously, when
        __open_resource__() returned a falsy handle, `data` was never
        bound and setByteStream() raised NameError; an empty byte
        stream is used as the fallback instead.
        """

        if self.__data:
            return

        data = ""
        fd = self.__open_resource__()
        if fd:
            data = fd.read()
            fd.close()

        self.__data = InputSource()
        self.__data.setByteStream(_StringIO(data))
Ejemplo n.º 14
0
def parse(xml):
    """Returns a hierarchy of Element objects parsed from
    the supplied XML source.

    The XML source can be a full document or any well-formed
    XML fragment.

    In addition, the `xml` parameter can be a file path, a string,
    a file-like object or a URL.
    """

    from xml.sax.handler import feature_namespaces, feature_namespace_prefixes
    from xml.sax import make_parser, SAXNotSupportedException, SAXNotRecognizedException

    import urllib

    builder = __XMLBuilder()

    parser = make_parser()

    # Perform namespace processing
    parser.setFeature(feature_namespaces, True)

    # Report original prefixed names (i.e. qname is passed to startElementNS);
    # if the parser cannot do that we have fallback code anyway.
    try:
        parser.setFeature(feature_namespace_prefixes, True)
    except (SAXNotSupportedException, SAXNotRecognizedException):
        pass

    parser.setContentHandler(builder)
    parser.setErrorHandler(builder)

    if not hasattr(xml, 'read'):
        # Resolution order: local file, then URL, then literal XML string.
        # `except Exception` (not a bare except) so SystemExit and
        # KeyboardInterrupt are no longer swallowed while probing.
        try:
            xml = open(xml)
        except Exception:
            try:
                xml = urllib.urlopen(xml)
            except Exception:
                xml = _StringIO(xml)

    parser.parse(xml)

    return builder.tree
Ejemplo n.º 15
0
def _updatezapper ():
    """
    Update filter rules (.zap files).
    """
    global updatelog
    log = _StringIO()
    doreload = False
    try:
        doreload = _update_filter(config, log=log, dryrun=False)
        updatelog = log.getvalue()
        config.write_filterconf()
    except StandardError:
        # Keep whatever was logged before the failure and append the
        # traceback for diagnosis.
        updatelog = log.getvalue()
        updatelog += _traceback.format_exc()
    else:
        if doreload:
            # NOTE(review): reload-on-update is currently a no-op --
            # presumably a filter/proxy reload belongs here; confirm.
            # pass
            pass
Ejemplo n.º 16
0
    def __repr__(self, asdoc=False, encoding='UTF8'):
        """Return this Element serialized as XML.

        Without `asdoc` the result is a bare fragment (not necessarily a
        valid document).  With `asdoc` an XML version processing
        instruction using `encoding` is prepended so a valid document is
        produced.
        """

        sink = _StringIO()
        self.write(sink, encoding)
        body = sink.getvalue().strip()

        if asdoc:
            return '<?xml version="1.0" encoding="%s"?>%s' % (encoding, body)
        return body
Ejemplo n.º 17
0
    def __open_resource__(self):
        """Opens the resource depends on the type of information given.

        If it is a file handle, nothing needs to be done; if it is the XML data,
        make it readable like a file; if it is a filename, open it and return
        the file handle.

        Return: A handle to read from by calling the method 'read()' of the
        handle, or None when nothing could be opened.
        """

        # Already file-like: use as-is.
        if hasattr(self.url_file_string, 'read'):
            return self.url_file_string

        # Conventional "-" means standard input.
        if self.url_file_string == "-":
            return sys.stdin

        # Literal XML data.  startswith() also copes with an empty string,
        # where the previous [0] indexing raised IndexError.
        if self.url_file_string.startswith("<"):
            return _StringIO(self.url_file_string.encode("utf-8"))

        # Otherwise treat it as a filename.  Only I/O errors mean "could
        # not open"; the previous bare except also hid real bugs.
        try:
            return open(self.url_file_string)
        except (IOError, OSError):
            return None
Ejemplo n.º 18
0
def convert(path):
    """
    Run converter: extract and return the text of the PDF at `path`.

    If extraction is forbidden by the document's permissions, a
    decrypted copy is produced with qpdf and conversion is retried on
    that copy.
    """
    import subprocess

    rsrcmgr = _PDFResourceManager()
    retstr = _StringIO()
    device = _TextConverter(rsrcmgr,
                            retstr,
                            codec="utf-8",
                            laparams=_LAParams())
    # open() instead of the Python-2-only file() builtin.
    stream = open(path, "rb")
    interpreter = _PDFPageInterpreter(rsrcmgr, device)

    try:
        for page in _PDFPage.get_pages(stream,
                                       set(),
                                       maxpages=0,
                                       password="",
                                       caching=True,
                                       check_extractable=True):
            interpreter.process_page(page)
        return retstr.getvalue()
    except _PDFTextExtractionNotAllowed:
        decr_file = "/".join([
            _os.path.split(path)[0],
            _os.path.split(path)[-1].split(".")[0] + "_decrypted.pdf"
        ])
        if _os.path.exists(decr_file):
            return convert(decr_file)
        # argv list instead of os.system("qpdf ... %s ...") -- the old
        # shell string was injectable through `path`.
        subprocess.call(["qpdf", "--decrypt", path, decr_file])
        return convert(decr_file)
    finally:
        # Previously these leaked on the exception path.
        stream.close()
        device.close()
        retstr.close()
Ejemplo n.º 19
0
    def __open_resource__(self):
        """Opens the resource depends on the type of information given.

        If it is a file handle, nothing needs to be done; if it is the XML data,
        make it readable like a file; if it is a filename, open it and return
        the file handle.

        Return: A handle to read from by calling the method 'read()' of the
        handle, or None when nothing could be opened.
        """

        # Already file-like: use as-is.
        if hasattr(self.url_file_string, 'read'):
            return self.url_file_string

        # Conventional "-" means standard input.
        if self.url_file_string == "-":
            return sys.stdin

        # Literal XML data.  startswith() also copes with an empty string,
        # where the previous [0] indexing raised IndexError.
        if self.url_file_string.startswith("<"):
            return _StringIO(self.url_file_string.encode("utf-8"))

        # Otherwise treat it as a filename.  Only I/O errors mean "could
        # not open"; the previous bare except also hid real bugs.
        try:
            return open(self.url_file_string)
        except (IOError, OSError):
            return None
Ejemplo n.º 20
0
 def __init__(self, data=None, mode=None):
     """Wrap an in-memory _StringIO, optionally pre-loaded with `data`.

     When initial data is given the buffer is rewound so reads start
     at the beginning.
     """
     backing = _StringIO()
     if data is not None:
         backing.write(data)
         backing.seek(0)
     super(StringIO, self).__init__(backing, mode)
Ejemplo n.º 21
0
def _safe_repr(object, context, maxlevels, level):
    # Cycle-safe repr helper: returns (repr_string, readable, recursive).
    # NOTE: Python 2 only -- uses backtick repr syntax.
    typ = _type(object)
    if typ is str:
        # Until `locale` is imported, plain repr cannot be locale-dependent,
        # so take the fast path.
        if 'locale' not in _sys.modules:
            return `object`, True, False
        # Pick the quote style that needs the least escaping.
        if "'" in object and '"' not in object:
            closure = '"'
            quotes = {'"': '\\"'}
        else:
            closure = "'"
            quotes = {"'": "\\'"}
        qget = quotes.get
        sio = _StringIO()
        write = sio.write
        for char in object:
            if char.isalpha():
                write(char)
            else:
                # [1:-1] strips the quotes that backtick repr adds around
                # a one-character string.
                write(qget(char, `char`[1:-1]))
        return ("%s%s%s" % (closure, sio.getvalue(), closure)), True, False

    if typ is dict:
        if not object:
            return "{}", True, False
        objid = _id(object)
        if maxlevels and level > maxlevels:
            return "{...}", False, objid in context
        if objid in context:
            # Object is already being formatted further up the stack:
            # a reference cycle was detected.
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        saferepr = _safe_repr
        for k, v in object.iteritems():
            krepr, kreadable, krecur = saferepr(k, context, maxlevels, level)
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
            append("%s: %s" % (krepr, vrepr))
            readable = readable and kreadable and vreadable
            if krecur or vrecur:
                recursive = True
        del context[objid]
        return "{%s}" % _commajoin(components), readable, recursive

    if typ is list or typ is tuple:
        if typ is list:
            if not object:
                return "[]", True, False
            format = "[%s]"
        elif _len(object) == 1:
            # One-element tuples need the trailing comma.
            format = "(%s,)"
        else:
            if not object:
                return "()", True, False
            format = "(%s)"
        objid = _id(object)
        if maxlevels and level > maxlevels:
            return format % "...", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        for o in object:
            orepr, oreadable, orecur = _safe_repr(o, context, maxlevels, level)
            append(orepr)
            if not oreadable:
                readable = False
            if orecur:
                recursive = True
        del context[objid]
        return format % _commajoin(components), readable, recursive

    # Fallback: anything whose repr starts with '<' (default object repr)
    # is considered not round-trippable, hence not "readable".
    rep = `object`
    return rep, (rep and not rep.startswith('<')), False
Ejemplo n.º 22
0
def _safe_repr(object, context, maxlevels, level):
    # Cycle-safe repr helper: returns (repr_string, readable, recursive).
    typ = _type(object)
    if typ is str:
        # Until `locale` is imported, plain repr cannot be locale-dependent,
        # so take the fast path.
        if 'locale' not in _sys.modules:
            return (repr(object), True, False)
        # Pick the quote style that needs the least escaping.
        if "'" in object and '"' not in object:
            closure = '"'
            quotes = {'"': '\\"'}
        else:
            closure = "'"
            quotes = {"'": "\\'"}
        qget = quotes.get
        sio = _StringIO()
        write = sio.write
        for char in object:
            if char.isalpha():
                write(char)
            else:
                # repr(char)[1:-1] strips the surrounding quotes.
                write(qget(char, repr(char)[1:-1]))

        return ('%s%s%s' % (closure, sio.getvalue(), closure), True, False)
    else:
        # Only take the container fast paths when __repr__ was not
        # overridden by a subclass.
        r = getattr(typ, '__repr__', None)
        if issubclass(typ, dict) and r is dict.__repr__:
            if not object:
                return ('{}', True, False)
            objid = _id(object)
            if maxlevels and level >= maxlevels:
                return ('{...}', False, objid in context)
            if objid in context:
                # Already being formatted further up the stack: cycle.
                return (_recursion(object), False, True)
            context[objid] = 1
            readable = True
            recursive = False
            components = []
            append = components.append
            level += 1
            saferepr = _safe_repr
            for k, v in _sorted(object.items()):
                krepr, kreadable, krecur = saferepr(k, context, maxlevels, level)
                vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
                append('%s: %s' % (krepr, vrepr))
                readable = readable and kreadable and vreadable
                if krecur or vrecur:
                    recursive = True

            del context[objid]
            return ('{%s}' % _commajoin(components), readable, recursive)
        if issubclass(typ, list) and r is list.__repr__ or issubclass(typ, tuple) and r is tuple.__repr__:
            if issubclass(typ, list):
                if not object:
                    return ('[]', True, False)
                format = '[%s]'
            elif _len(object) == 1:
                # One-element tuples need the trailing comma.
                format = '(%s,)'
            else:
                if not object:
                    return ('()', True, False)
                format = '(%s)'
            objid = _id(object)
            if maxlevels and level >= maxlevels:
                return (format % '...', False, objid in context)
            if objid in context:
                return (_recursion(object), False, True)
            context[objid] = 1
            readable = True
            recursive = False
            components = []
            append = components.append
            level += 1
            for o in object:
                orepr, oreadable, orecur = _safe_repr(o, context, maxlevels, level)
                append(orepr)
                if not oreadable:
                    readable = False
                if orecur:
                    recursive = True

            del context[objid]
            return (format % _commajoin(components), readable, recursive)
        # Fallback: a repr starting with '<' (default object repr) is
        # considered not round-trippable, hence not "readable".
        rep = repr(object)
        return (rep, rep and not rep.startswith('<'), False)
Ejemplo n.º 23
0
"""Implements (a subset of) Sun XDR -- eXternal Data Representation.
Ejemplo n.º 24
0
    def subsample(self,
                  genome_size=6601757,
                  read_cov_depth=80,
                  pc_loss=0.2,
                  force=False,
                  cov_closeness=5):
        '''
        Given the size in basepairs of a genome sequence, downsample fastq files to a 
        desired average read coverage depth predicted after read alignment. Read lengths
        are taken from the file. By default, 20% are assumed to be lost at downstream 
        quality control stages (e.g. quality score based trimming). The percent loss is 
        used in coverage depth estimation. cov_closeness, which defaults to 5, will prevent
        subsampling if within 5x coverage: avoids time consuming subsampling that will only 
        make a small difference.
        '''

        # Maps pairname -> {1: path, 2: path} for the (possibly) subsampled files.
        # NOTE(review): Python 2 code (xrange, .next()) -- confirm before porting.
        subsampled_read_files = {}
        start_time = _time.time()
        for cnum, (pairname, files) in enumerate(self.read_files.items()):

            processed_path_1 = insert_suffix(files[1], '_subsmp')
            processed_path_2 = insert_suffix(files[2], '_subsmp')

            if not all([_os.path.exists(processed_path_1),
                        _os.path.exists(processed_path_2)]) \
                    or force:

                # Transparently handle gzipped input on the first mate file.
                if files[1][-2:] == 'gz':
                    fh1 = _gzip.open(files[1])
                else:
                    fh1 = open(files[1])

                # Read length is taken from the first record of the file.
                aread = _SeqIO.parse(fh1, 'fastq').next()
                read_len = len(aread.seq)

                print('Counting reads in %s' % files[1])
                fh1.seek(0)
                lines = 0
                # report per half million reads
                interval = 2000000
                nextreport = interval
                for line in fh1:
                    lines += 1
                    if lines == nextreport:
                        print('{:,} reads'.format(lines / 4))
                        nextreport += interval

                # Four fastq lines per read.
                totalreads = lines / 4.0
                print('Found %s reads' % totalreads)
                full_depth_coverage = read_len * 2 * totalreads * (
                    1 - pc_loss) / genome_size
                print(
                    'These paired read files would provide approximately {:.1f}x coverage depth'
                    .format(full_depth_coverage))
                numreads2keep = int(
                    round(
                        genome_size * read_cov_depth / (read_len * 2) /
                        (1 - pc_loss), 0))

                if numreads2keep >= totalreads:
                    print(
                        'This pair of read files is estimated to provide only {:.1f}x coverage, but {}x requested.'
                        .format(full_depth_coverage, read_cov_depth))
                    print('No sampling performed. Original files will be used')
                    # pass original files over with subsampled
                    subsampled_read_files[pairname] = {}
                    subsampled_read_files[pairname][1] = files[1]
                    subsampled_read_files[pairname][2] = files[2]
                    fh1.close()
                    if len(self.read_files) > 1:
                        # report durations, time left etc
                        _report_time(start_time, cnum, len(self.read_files))

                    continue
                elif full_depth_coverage < read_cov_depth + cov_closeness:
                    print(
                        'This pair of read files is estimated to provide {:.1f}x coverage which is within {}x of {}x requested.'
                        .format(full_depth_coverage, cov_closeness,
                                read_cov_depth))
                    print('No sampling performed. Original files will be used')
                    # pass original files over with subsampled
                    subsampled_read_files[pairname] = {}
                    subsampled_read_files[pairname][1] = files[1]
                    subsampled_read_files[pairname][2] = files[2]
                    fh1.close()
                    if len(self.read_files) > 1:
                        # report durations, time left etc
                        _report_time(start_time, cnum, len(self.read_files))

                    continue
                else:
                    print(
                        'For approximately {}x read coverage, will retain {} of {} {}bp read pairs'
                        .format(read_cov_depth, numreads2keep, totalreads,
                                read_len))

                    fh1.seek(0)
                    if files[2][-2:] == 'gz':
                        fh2 = _gzip.open(files[2])
                    else:
                        fh2 = open(files[2])

                    fout1 = _gzip.open(processed_path_1, 'wb')
                    fout2 = _gzip.open(processed_path_2, 'wb')

                    # Sample in batches of batch_size reads, keeping
                    # keep_per_pop randomly-chosen reads from each batch.
                    batch_size = 200000
                    keep_per_pop = int(
                        numreads2keep / float(totalreads) * batch_size) + 1
                    nextwrite = batch_size
                    written = 0
                    n1 = 0
                    n2 = 0
                    these_lines1 = []
                    these_lines2 = []
                    reportfreq = 10
                    thisreport = 0
                    print('Subsampling . . .')
                    # fh2 is advanced in lockstep inside this fh1 loop so
                    # the same indices are kept from both mates of a pair.
                    for line in fh1:
                        these_lines1 += [line]
                        if len(these_lines1) % 4 == 0:
                            n1 += 1

                        if n1 == nextwrite:
                            keep_indices = sorted(
                                _sample(xrange(batch_size), keep_per_pop))
                            keep_these = []
                            for i in keep_indices:
                                i1 = i * 4
                                i2 = i * 4 + 4
                                keep_these += these_lines1[i1:i2]

                            # try parsing a read for QC
                            assert _SeqIO.read(
                                _StringIO(''.join(keep_these[:4])), 'fastq')
                            fout1.write(''.join(keep_these))
                            these_lines1 = []
                            written += keep_per_pop
                            thisreport += 1
                            if thisreport == reportfreq or written == keep_per_pop:
                                # report first time and at intevals
                                print(
                                    'Written {:,} reads ({:.1%}) to {}'.format(
                                        written,
                                        written / float(numreads2keep),
                                        processed_path_1))

                            for line2 in fh2:
                                these_lines2 += [line2]
                                if len(these_lines2) % 4 == 0:
                                    n2 += 1

                                if n2 == nextwrite:
                                    keep_these = []
                                    for i in keep_indices:
                                        i1 = i * 4
                                        i2 = i * 4 + 4
                                        keep_these += these_lines2[i1:i2]

                                    assert _SeqIO.read(
                                        _StringIO(''.join(keep_these[:4])),
                                        'fastq')
                                    fout2.write(''.join(keep_these))
                                    these_lines2 = []
                                    if thisreport == reportfreq or written == keep_per_pop:
                                        thisreport = 0
                                        print(
                                            'Written {:,} reads ({:.1%}) to {}'
                                            .format(
                                                written,
                                                written / float(numreads2keep),
                                                processed_path_2))
                                    nextwrite += batch_size
                                    break

                    # write remainder
                    remainder = nextwrite - n1
                    keep_in_remainder = int(
                        keep_per_pop * (remainder / float(batch_size))) + 1
                    keep_indices = sorted(
                        _sample(xrange(remainder), keep_in_remainder))
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines1[i1:i2]

                    # try parsing a read for QC
                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])),
                                       'fastq')
                    fout1.write(''.join(keep_these))
                    written += keep_in_remainder
                    print('Written {:,} reads ({:.1%}) to {}'.format(
                        written, written / float(numreads2keep),
                        processed_path_1))

                    # get remainder
                    for line2 in fh2:
                        these_lines2 += [line2]

                    # write remainder
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines2[i1:i2]

                    assert _SeqIO.read(
                        _StringIO(''.join(keep_these[:4])),
                        'fastq')  ###### check why keep_these was empty
                    fout2.write(''.join(keep_these))
                    print('Written {:,} reads ({:.1%}) to {}'.format(
                        written, written / float(numreads2keep),
                        processed_path_2))

                    # not sure if this is quicker/slower (more calls to .join())
                    # this_read = []
                    # for line in fh1:
                    # this_read += [line]
                    # if len(this_read) == 4:
                    # these_reads1 += [''.join(this_read)]
                    # #these_reads1 += this_read
                    # this_read = []
                    # n1 += 1

                    # if n1 == nextwrite:
                    # keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
                    # # try parsing a read for QC
                    # assert _SeqIO.read(_StringIO(these_reads1[0]), 'fastq')
                    # fout1.write(''.join([these_reads1[i] for i in keep_indices]))
                    # these_reads1 = []
                    # written += keep_per_pop
                    # print('Written {:,} reads ({:.2%}) to {}'.format(written,
                    # written/float(numreads2keep),
                    # processed_path_1))
                    # for line2 in fh2:
                    # this_read += [line2]
                    # if len(this_read) == 4:
                    # these_reads2 += [''.join(this_read)]
                    # this_read = []
                    # n2 += 1

                    # if n2 == nextwrite:
                    # assert _SeqIO.read(_StringIO(these_reads2[0]), 'fastq')
                    # fout2.write(''.join([these_reads2[i] for i in keep_indices]))
                    # these_reads2 = []
                    # print('Written {:,} reads ({:.2%}) to {}'.format(written,
                    # written/float(numreads2keep),
                    # processed_path_2))
                    # nextwrite += batch_size
                    # break

                    fout1.close()
                    fout2.close()
                    fh1.close()
                    fh2.close()

            else:
                print('Found:')
                print(processed_path_1)
                print(processed_path_2)
                print('use "force = True" to overwrite')

            if len(self.read_files) > 1:
                # report durations, time left etc
                _report_time(start_time, cnum, len(self.read_files))

            subsampled_read_files[pairname] = {}
            subsampled_read_files[pairname][1] = processed_path_1
            subsampled_read_files[pairname][2] = processed_path_2

        # replace here as this step is optional
        self.fullsized_read_files = list(self.read_files)
        self.read_files = subsampled_read_files
Ejemplo n.º 25
0
"""Implements (a subset of) Sun XDR -- eXternal Data Representation.
Ejemplo n.º 26
0
 def __init__(self, data=None, mode=None):
     """Initialise with an in-memory buffer seeded from `data`, if any."""
     buf = _StringIO()
     if data is not None:
         # Pre-load and rewind so the wrapped file reads from the start.
         buf.write(data)
         buf.seek(0)
     super(StringIO, self).__init__(buf, mode)
Ejemplo n.º 27
0
 def pformat(self, object, enable_pickle=None):
     """Format *object* to a string, defaulting enable_pickle from self."""
     if enable_pickle is None:
         enable_pickle = self.enable_pickle
     out = _StringIO()
     self._format(object, out, 0, 0, {}, 0, enable_pickle)
     return out.getvalue()
Ejemplo n.º 28
0
def _safe_repr(object, context, maxlevels, level):
    """Return ``(repr_string, readable, recursive)`` for *object*.

    Cycle-safe repr helper in the style of ``pprint._safe_repr``:
    ``readable`` is True when the representation should ``eval()`` back to
    an equal object; ``recursive`` is True when a self-reference was found.
    *context* holds the ids of containers currently being formatted and
    *maxlevels* bounds nesting depth (falsy means unlimited).
    """
    typ = _type(object)
    if typ is str:
        # Fast path: without the locale module loaded, repr() is safe as-is.
        if 'locale' not in _sys.modules:
            return repr(object), True, False
        # Choose the quote style that avoids escaping where possible.
        if "'" in object and '"' not in object:
            closure = '"'
            quotes = {'"': '\\"'}
        else:
            closure = "'"
            quotes = {"'": "\\'"}
        qget = quotes.get
        sio = _StringIO()
        write = sio.write
        for char in object:
            if char.isalpha():
                write(char)
            else:
                # escape via repr() of the single character, minus its quotes
                write(qget(char, repr(char)[1:-1]))
        return ("%s%s%s" % (closure, sio.getvalue(), closure)), True, False

    r = getattr(typ, '__repr__', None)
    if issubclass(typ, dict) and r is dict.__repr__:
        if not object:
            return "{}", True, False
        objid = _id(object)
        # BUG FIX: was ``level > maxlevels`` -- off by one versus the sibling
        # implementations in this file and CPython's pprint.
        if maxlevels and level >= maxlevels:
            return "{...}", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        saferepr = _safe_repr
        # NOTE: .items() (not the old .iteritems()) for consistency with the
        # other _safe_repr copies in this file and Python 3 compatibility.
        for k, v in object.items():
            krepr, kreadable, krecur = saferepr(k, context, maxlevels, level)
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
            append("%s: %s" % (krepr, vrepr))
            # BUG FIX: readability must accumulate across ALL keys and values;
            # the old code overwrote it with the last vreadable.
            readable = readable and kreadable and vreadable
            if krecur or vrecur:
                recursive = True
        del context[objid]
        return "{%s}" % _commajoin(components), readable, recursive

    # BUG FIX: the old condition mixed and/or so ``format = '(%s)'`` always
    # clobbered the list format; restore the canonical pairing.
    if (issubclass(typ, list) and r is list.__repr__) or \
       (issubclass(typ, tuple) and r is tuple.__repr__):
        if issubclass(typ, list):
            if not object:
                return "[]", True, False
            format = "[%s]"
        elif _len(object) == 1:
            format = "(%s,)"
        else:
            if not object:
                return "()", True, False
            format = "(%s)"
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return format % "...", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        for o in object:
            orepr, oreadable, orecur = _safe_repr(o, context, maxlevels, level)
            append(orepr)
            if not oreadable:
                readable = False
            if orecur:
                recursive = True
        del context[objid]
        return format % _commajoin(components), readable, recursive

    rep = repr(object)
    # Default-object reprs ('<...>') are not round-trippable; an empty repr
    # is treated as unreadable too (the old ``if rep: pass`` was dead code).
    return rep, (rep and not rep.startswith('<')), False
Ejemplo n.º 29
0
 # NOTE(review): the trailing "#3--" tags look like version-preprocessor
 # markers (lines kept for one Python version only?) -- confirm before editing.
 # *newline* is accepted for io.StringIO signature compatibility but ignored:
 # _StringIO performs no newline translation.
 def StringIO(newline):                                                       #3--
     return _StringIO()                                                       #3--
Ejemplo n.º 30
0
 def pformat(self, object):
     """Format *object* and return the pretty-printed text."""
     buf = _StringIO()
     self._format(object, buf, 0, 0, {}, 0)
     return buf.getvalue()
Ejemplo n.º 31
0
 def pformat(self, object):
     """Pretty-print *object* into an in-memory stream and return its contents."""
     stream = _StringIO()
     self._format(object, stream, 0, 0, {}, 0)
     return stream.getvalue()
Ejemplo n.º 32
0
Archivo: pyfs.py Proyecto: kirkboy/bits
 def __init__(self, basename):
     """Expose an in-memory file as *basename* through the pyfs registry."""
     self.basename = basename
     self._file = _StringIO()
     # register the open/read callbacks under the chosen name
     pyfs_add(basename, self._do_open, self._do_read)
Ejemplo n.º 33
0
def _safe_repr(object, context, maxlevels, level, enable_pickle=None):
    """Return ``(repr_string, readable, recursive)`` for *object*.

    Cycle-safe repr helper. ``readable`` is True when the result should
    ``eval()`` back to an equal object; ``recursive`` flags self-references.
    *enable_pickle* additionally allows pickleable objects to be rendered as
    ``Constructor(arg, ...)``; when None the module default is used.
    """
    if enable_pickle is None:
        enable_pickle = ENABLE_PICKLE_DEFAULT
    typ = _type(object)
    if typ is str:
        # Without the locale module loaded, repr() is safe as-is.
        if 'locale' not in _sys.modules:
            return repr(object), True, False
        # Choose the quote style that avoids escaping where possible.
        if "'" in object and '"' not in object:
            closure = '"'
            quotes = {'"': '\\"'}
        else:
            closure = "'"
            quotes = {"'": "\\'"}
        qget = quotes.get
        sio = _StringIO()
        write = sio.write
        for char in object:
            if char.isalpha():
                write(char)
            else:
                write(qget(char, repr(char)[1:-1]))
        return ("%s%s%s" % (closure, sio.getvalue(), closure)), True, False

    r = getattr(typ, "__repr__", None)
    if issubclass(typ, dict) and r is dict.__repr__:
        if not object:
            return "{}", True, False
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return "{...}", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        saferepr = _safe_repr
        for k, v in _sorted(object.items()):
            # BUG FIX: thread enable_pickle through nested calls; previously
            # nested dict entries silently fell back to the module default,
            # unlike the list/tuple branch below.
            krepr, kreadable, krecur = saferepr(k, context, maxlevels, level,
                                                enable_pickle)
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level,
                                                enable_pickle)
            append("%s: %s" % (krepr, vrepr))
            readable = readable and kreadable and vreadable
            if krecur or vrecur:
                recursive = True
        del context[objid]
        return "{%s}" % _commajoin(components), readable, recursive

    if (issubclass(typ, list) and r is list.__repr__) or \
       (issubclass(typ, tuple) and r is tuple.__repr__):
        if issubclass(typ, list):
            if not object:
                return "[]", True, False
            format = "[%s]"
        elif _len(object) == 1:
            format = "(%s,)"
        else:
            if not object:
                return "()", True, False
            format = "(%s)"
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return format % "...", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        for o in object:
            orepr, oreadable, orecur = _safe_repr(
                o, context, maxlevels, level, enable_pickle)
            append(orepr)
            if not oreadable:
                readable = False
            if orecur:
                recursive = True
        del context[objid]
        return format % _commajoin(components), readable, recursive

    ## Use pickle data
    if enable_pickle and _pickleable(typ, object):
        # NOTE(review): assumes __reduce__ returns a 2-tuple (callable, args);
        # presumably _pickleable guarantees this -- confirm, since __reduce__
        # may legally return up to a 6-tuple.
        reduce_data = object.__reduce__()
        constructor, args = reduce_data
        constructor = constructor.__name__
        if not args:
            return constructor+"()", True, False
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return constructor+"(...)", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        saferepr = _safe_repr
        for arg in args:
            # BUG FIX: propagate enable_pickle here as well.
            arepr, areadable, arecur = saferepr(arg, context, maxlevels, level,
                                                enable_pickle)
            append(arepr)
            readable = readable and areadable
            if arecur:
                recursive = True
        del context[objid]
        return constructor + "(%s)" % _commajoin(components), readable, recursive

    rep = repr(object)
    return rep, (rep and not rep.startswith('<')), False
Ejemplo n.º 34
0
 def reset(self):
     """Discard any buffered output by installing a fresh in-memory buffer."""
     self.__buf = _StringIO()
Ejemplo n.º 35
0
 def reset(self):
     """Reset the internal buffer so previously written data is dropped."""
     self.__buf = _StringIO()
Ejemplo n.º 36
0
 def pformat(self, object, enable_pickle=None):
     """Return the formatted representation of *object*.

     Falls back to the instance's pickle setting when *enable_pickle* is None.
     """
     if enable_pickle is None:
         enable_pickle = self.enable_pickle
     sink = _StringIO()
     self._format(object, sink, 0, 0, {}, 0, enable_pickle)
     return sink.getvalue()
Ejemplo n.º 37
0
def _safe_repr(object, context, maxlevels, level):
    """Return ``(repr_string, readable, recursive)`` for *object*.

    Cycle-safe repr helper (pprint style): ``readable`` is True when the
    result should ``eval()`` back to an equal object, ``recursive`` is True
    when a self-reference was detected. *context* holds ids of containers
    currently being formatted; *maxlevels* bounds nesting (falsy = unlimited).
    """
    typ = _type(object)
    if typ is str:
        if 'locale' not in _sys.modules:
            return repr(object), True, False
        # Choose the quote style that avoids escaping where possible.
        if "'" in object and '"' not in object:
            closure = '"'
            quotes = {'"': '\\"'}
        else:
            closure = "'"
            quotes = {"'": "\\'"}
        qget = quotes.get
        sio = _StringIO()
        write = sio.write
        for char in object:
            if char.isalpha():
                write(char)
            else:
                write(qget(char, repr(char)[1:-1]))
        return ("%s%s%s" % (closure, sio.getvalue(), closure)), True, False

    r = getattr(typ, "__repr__", None)
    # CONSISTENCY FIX: compare the unbound __repr__ with ``is`` (identity),
    # matching the other _safe_repr copies in this file and CPython's pprint;
    # ``==`` was an outlier here.
    if issubclass(typ, dict) and r is dict.__repr__:
        if not object:
            return "{}", True, False
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return "{...}", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        saferepr = _safe_repr
        for k, v in _sorted(object.items()):
            krepr, kreadable, krecur = saferepr(k, context, maxlevels, level)
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
            append("%s: %s" % (krepr, vrepr))
            readable = readable and kreadable and vreadable
            if krecur or vrecur:
                recursive = True
        del context[objid]
        return "{%s}" % _commajoin(components), readable, recursive

    if (issubclass(typ, list) and r is list.__repr__) or \
       (issubclass(typ, tuple) and r is tuple.__repr__):
        if issubclass(typ, list):
            if not object:
                return "[]", True, False
            format = "[%s]"
        elif _len(object) == 1:
            format = "(%s,)"
        else:
            if not object:
                return "()", True, False
            format = "(%s)"
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return format % "...", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        for o in object:
            orepr, oreadable, orecur = _safe_repr(o, context, maxlevels, level)
            append(orepr)
            if not oreadable:
                readable = False
            if orecur:
                recursive = True
        del context[objid]
        return format % _commajoin(components), readable, recursive

    rep = repr(object)
    return rep, (rep and not rep.startswith('<')), False
Ejemplo n.º 38
0
    def subsample(self, genome_size = 6601757, 
                        read_cov_depth = 80, 
                        pc_loss = 0.2, 
                        force = False, 
                        cov_closeness = 5):
        '''
        Given the size in basepairs of a genome sequence, downsample fastq files to a 
        desired average read coverage depth predicted after read alignment. Read lengths
        are taken from the file. By default, 20% are assumed to be lost at downstream 
        quality control stages (e.g. quality score based trimming). The percent loss is 
        used in coverage depth estimation. cov_closeness, which defaults to 5, will prevent
        subsampling if within 5x coverage: avoids time consuming subsampling that will only 
        make a small difference.

        Parameters:
        genome_size -- genome length in basepairs used for coverage arithmetic
        read_cov_depth -- target average coverage depth after alignment
        pc_loss -- fraction of reads expected to be lost to downstream QC
        force -- regenerate output files even if they already exist
        cov_closeness -- skip subsampling when estimated coverage is within
                         this many x of the requested depth

        Side effects: writes gzipped "*_subsmp" fastq files next to the
        inputs, stores the original mapping in self.fullsized_read_files and
        replaces self.read_files with the (possibly unchanged) paths.

        NOTE(review): Python 2 idioms throughout (.next(), xrange, integer
        division) -- this will not run unmodified on Python 3. _sample is
        presumably random.sample and _report_time a progress helper defined
        elsewhere in this module -- confirm.
        '''

        subsampled_read_files = {}
        start_time = _time.time()
        # self.read_files maps pairname -> {1: R1 path, 2: R2 path}
        for cnum,(pairname,files) in enumerate(self.read_files.items()):
            
            processed_path_1 = insert_suffix(files[1], '_subsmp')
            processed_path_2 = insert_suffix(files[2], '_subsmp')
            
            # only (re)generate when outputs are missing or force is set
            if not all([_os.path.exists(processed_path_1), 
                        _os.path.exists(processed_path_2)]) \
                    or force:
                
                if files[1][-2:] == 'gz':
                    fh1 = _gzip.open(files[1])
                else:
                    fh1 = open(files[1])
                
                # read length is taken from the first R1 read and assumed
                # uniform across both files
                aread = _SeqIO.parse(fh1, 'fastq').next()
                read_len = len(aread.seq)
                
                print('Counting reads in %s' % files[1])
                fh1.seek(0)
                lines = 0
                # report per half million reads
                interval = 2000000
                nextreport = interval
                for line in fh1:
                    lines += 1
                    if lines == nextreport:
                        print('{:,} reads'.format(lines/4))
                        nextreport += interval
                
                # four FASTQ lines per read
                totalreads = lines / 4.0
                print('Found %s reads' % totalreads)
                # x2 for paired reads; (1 - pc_loss) models post-QC survivors
                full_depth_coverage = read_len * 2 * totalreads * (1 - pc_loss) / genome_size
                print('These paired read files would provide approximately {:.1f}x coverage depth'.format(full_depth_coverage))
                numreads2keep = int( round(genome_size * read_cov_depth / (read_len * 2) /  (1 - pc_loss), 0) )
                
                if numreads2keep >= totalreads:
                    print('This pair of read files is estimated to provide only {:.1f}x coverage, but {}x requested.'.format(full_depth_coverage, read_cov_depth))
                    print('No sampling performed. Original files will be used')
                    # pass original files over with subsampled
                    subsampled_read_files[pairname] = {}
                    subsampled_read_files[pairname][1] = files[1]
                    subsampled_read_files[pairname][2] = files[2]
                    fh1.close()
                    if len(self.read_files) > 1:
                        # report durations, time left etc
                        _report_time(start_time, cnum, len(self.read_files))
                    
                    continue
                elif full_depth_coverage < read_cov_depth + cov_closeness:
                    print('This pair of read files is estimated to provide {:.1f}x coverage which is within {}x of {}x requested.'.format(full_depth_coverage, cov_closeness, read_cov_depth))
                    print('No sampling performed. Original files will be used')
                    # pass original files over with subsampled
                    subsampled_read_files[pairname] = {}
                    subsampled_read_files[pairname][1] = files[1]
                    subsampled_read_files[pairname][2] = files[2]
                    fh1.close()
                    if len(self.read_files) > 1:
                        # report durations, time left etc
                        _report_time(start_time, cnum, len(self.read_files))
                    
                    continue
                else:
                    print('For approximately {}x read coverage, will retain {} of {} {}bp read pairs'.format(
                                    read_cov_depth, numreads2keep, totalreads, read_len))
                    
                    fh1.seek(0)
                    if files[2][-2:] == 'gz':
                        fh2 = _gzip.open(files[2])
                    else:
                        fh2 = open(files[2])
                    
                    fout1 = _gzip.open(processed_path_1, 'wb')
                    fout2 = _gzip.open(processed_path_2, 'wb')
                    
                    # Sampling strategy: buffer batch_size read pairs at a
                    # time and randomly keep keep_per_pop of each batch, so
                    # memory stays bounded while R1/R2 keep matching indices.
                    batch_size = 200000
                    keep_per_pop = int(numreads2keep / float(totalreads) * batch_size) + 1
                    nextwrite = batch_size
                    written = 0
                    n1 = 0
                    n2 = 0
                    these_lines1 = []
                    these_lines2 = []
                    reportfreq = 10
                    thisreport = 0
                    print('Subsampling . . .')
                    for line in fh1:
                        these_lines1 += [line]
                        if len(these_lines1) % 4 == 0:
                            n1 += 1
                            
                        if n1 == nextwrite:
                            # the same keep_indices are reused below for fh2
                            # so that mate pairs stay together
                            keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
                            keep_these = []
                            for i in keep_indices:
                                i1 = i * 4
                                i2 = i * 4 + 4
                                keep_these += these_lines1[i1:i2]
                            
                            # try parsing a read for QC
                            assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                            fout1.write(''.join(keep_these))
                            these_lines1 = []
                            written += keep_per_pop
                            thisreport += 1
                            if thisreport == reportfreq or written == keep_per_pop:
                                # report first time and at intevals
                                print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                                                 written/float(numreads2keep),
                                                                                 processed_path_1))
                            
                            # advance fh2 in lockstep until its batch is full
                            for line2 in fh2:
                                these_lines2 += [line2]
                                if len(these_lines2) % 4 == 0:
                                    n2 += 1
                                
                                if n2 == nextwrite:
                                    keep_these = []
                                    for i in keep_indices:
                                        i1 = i * 4
                                        i2 = i * 4 + 4
                                        keep_these += these_lines2[i1:i2]
                                    
                                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                                    fout2.write(''.join(keep_these))
                                    these_lines2 = []
                                    if thisreport == reportfreq or written == keep_per_pop:
                                        thisreport = 0
                                        print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                                                         written/float(numreads2keep),
                                                                                         processed_path_2))
                                    nextwrite += batch_size
                                    break
                    
                    # write remainder
                    # (the final partial batch is sampled at the same rate)
                    remainder = nextwrite - n1
                    keep_in_remainder = int(keep_per_pop * (remainder / float(batch_size))) + 1
                    keep_indices = sorted(_sample(xrange(remainder), keep_in_remainder))
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines1[i1:i2]
                    
                    # try parsing a read for QC
                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                    fout1.write(''.join(keep_these))
                    written += keep_in_remainder
                    print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                                             written/float(numreads2keep),
                                                                             processed_path_1))
                    
                    # get remainder
                    for line2 in fh2:
                        these_lines2 += [line2]
                    
                    # write remainder
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines2[i1:i2]
                    
                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq') ###### check why keep_these was empty
                    fout2.write(''.join(keep_these))
                    print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                                             written/float(numreads2keep),
                                                                             processed_path_2))
                    
                    # not sure if this is quicker/slower (more calls to .join())
                    # this_read = []
                    # for line in fh1:
                        # this_read += [line]
                        # if len(this_read) == 4:
                            # these_reads1 += [''.join(this_read)]
                            # #these_reads1 += this_read
                            # this_read = []
                            # n1 += 1
                            
                        # if n1 == nextwrite:
                            # keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
                            # # try parsing a read for QC
                            # assert _SeqIO.read(_StringIO(these_reads1[0]), 'fastq')
                            # fout1.write(''.join([these_reads1[i] for i in keep_indices]))
                            # these_reads1 = []
                            # written += keep_per_pop
                            # print('Written {:,} reads ({:.2%}) to {}'.format(written,
                                                                             # written/float(numreads2keep),
                                                                             # processed_path_1))
                            # for line2 in fh2:
                                # this_read += [line2]
                                # if len(this_read) == 4:
                                    # these_reads2 += [''.join(this_read)]
                                    # this_read = []
                                    # n2 += 1
                                
                                # if n2 == nextwrite:
                                    # assert _SeqIO.read(_StringIO(these_reads2[0]), 'fastq')
                                    # fout2.write(''.join([these_reads2[i] for i in keep_indices]))
                                    # these_reads2 = []
                                    # print('Written {:,} reads ({:.2%}) to {}'.format(written,
                                                                                     # written/float(numreads2keep),
                                                                                     # processed_path_2))
                                    # nextwrite += batch_size
                                    # break
                    
                    fout1.close()
                    fout2.close()
                    fh1.close()
                    fh2.close()
                
            else:
                print('Found:')
                print(processed_path_1)
                print(processed_path_2)
                print('use "force = True" to overwrite')
            
            if len(self.read_files) > 1:
                # report durations, time left etc
                _report_time(start_time, cnum, len(self.read_files))
            
            subsampled_read_files[pairname] = {}
            subsampled_read_files[pairname][1] = processed_path_1
            subsampled_read_files[pairname][2] = processed_path_2

        # replace here as this step is optional
        self.fullsized_read_files = list(self.read_files)
        self.read_files = subsampled_read_files
Ejemplo n.º 39
0
def _safe_repr(object, context, maxlevels, level): 
    typ = _type(object) 
    if typ is str: 
        string = object 
        string = string.replace('\n', '\\n').replace('\r','\\r').replace('\t','\\t') 
        if 'locale' not in _sys.modules: 
            return repr(object), True, False 
        if "'" in object and '"' not in object: 
            closure = '"' 
            quotes = {'"': '\\"'} 
            string = string.replace('"','\\"') 
        else: 
            closure = "'" 
            quotes = {"'": "\\'"} 
            string = string.replace("'", "\\'") 
        try: 
            string.decode('utf8').encode('gbk', 'replace') 
            return ("%s%s%s" % (closure, string, closure)), True, False 
        except: 
            pass 
        qget = quotes.get 
        sio = _StringIO() 
        write = sio.write 
        for char in object: 
            if char.isalpha(): 
                write(char) 
            else: 
                write(qget(char, repr(char)[1:-1])) 
        return ("%s%s%s" % (closure, sio.getvalue(), closure)), True, False 

    if typ is unicode: 
        string = object.encode("utf8", 'replace') 
        string = string.replace('\n', '\\n').replace('\r','\\r').replace('\t','\\t') 
        if "'" in object and '"' not in object: 
            closure = '"' 
            quotes = {'"': '\\"'} 
            string = string.replace('"','\\"') 
        else: 
            closure = "'" 
            quotes = {"'": "\\'"} 
            string = string.replace("'", "\\'") 
        return ("u%s%s%s" % (closure, string, closure)), True, False 

    r = getattr(typ, "__repr__", None) 
    if issubclass(typ, dict) and r is dict.__repr__: 
        if not object: 
            return "{}", True, False 
        objid = _id(object) 
        if maxlevels and level >= maxlevels: 
            return "{...}", False, objid in context 
        if objid in context: 
            return _recursion(object), False, True 
        context[objid] = 1 
        readable = True 
        recursive = False 
        components = [] 
        append = components.append 
        level += 1 
        saferepr = _safe_repr 
        for k, v in _sorted(object.items()): 
            krepr, kreadable, krecur = saferepr(k, context, maxlevels, level) 
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level) 
            append("%s: %s" % (krepr, vrepr)) 
            readable = readable and kreadable and vreadable 
            if krecur or vrecur: 
                recursive = True 
        del context[objid] 
        return "{%s}" % _commajoin(components), readable, recursive 

    if (issubclass(typ, list) and r is list.__repr__) or \
        (issubclass(typ, tuple) and r is tuple.__repr__): 
        if issubclass(typ, list): 
            if not object: 
                return "[]", True, False 
            format = "[%s]" 
        elif _len(object) == 1: 
            format = "(%s,)" 
        else: 
            if not object: 
                return "()", True, False 
            format = "(%s)" 
        objid = _id(object) 
        if maxlevels and level >= maxlevels: 
            return format % "...", False, objid in context 
        if objid in context: 
            return _recursion(object), False, True 
        context[objid] = 1 
        readable = True 
        recursive = False 
        components = [] 
        append = components.append 
        level += 1 
        for o in object: 
            orepr, oreadable, orecur = _safe_repr(o, context, maxlevels, level) 
            append(orepr) 
            if not oreadable: 
                readable = False 
            if orecur: 
                recursive = True 
        del context[objid] 
        return format % _commajoin(components), readable, recursive 

    rep = repr(object) 
    return rep, (rep and not rep.startswith('<')), False 
Ejemplo n.º 40
0
 def __init__(self, basename):
     """Register this in-memory pseudo-file under *basename* with pyfs."""
     self.basename = basename
     # backing store for subsequent reads
     self._file = _StringIO()
     pyfs_add(self.basename, self._do_open, self._do_read)