Example 1
def main():
    '''
    do the thing
    '''
    parser = MyParser(
        description="script name - script description")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-f", "--f5f",
                       help="File list of fast5 paths")
    group.add_argument("-p", "--f5_path",
                       help="Fast5 top dir")
    group.add_argument("-s", "--signal",
                       help="Extracted signal file from SquigglePull")
    group.add_argument("-i", "--ind",
                       help="Individual fast5 file")
    parser.add_argument("--head", action="store_true",
                       help="Header present in signal or flat file")
    parser.add_argument("-n", "--Num",
                        help="Section of signal to look at - -n 2000 or -n 100,1500")
    parser.add_argument("--scale_hi", type=int, default=1200,
                        help="Upper limit for signal outlier scaling")
    parser.add_argument("--scale_low", type=int, default=0,
                        help="Lower limit for signal outlier scaling")
    # Arguments for now, but best way forward will probably be a config file
    parser.add_argument("--plot_colour", default='grey',
                        help="Colour of signal plot, takes any pyplot entry: k,r,b,g,red,blue,etc...")
    parser.add_argument("--save",
                        help="Save file readname_saveArg.pdf --save saveArg.pdf, use png, etc for other file types")
    parser.add_argument("--save_path",
                        help="Save filepath")
    parser.add_argument("--no_show", action="store_true",
                        help="Do not show plot (used for saving many)")
    parser.add_argument("--dpi", type=int, default=100,
                        help="Change DPI for publication figs, eg: --dpi 300")


    args = parser.parse_args()

    # print help if no arguments given
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)


    matplotlib.rcParams['savefig.dpi'] = args.dpi

    N = 0
    N1 = 0
    N2 = 0
    if args.Num:
        if ',' in args.Num:
            N1, N2 = args.Num.split(',')
            N1, N2 = int(N1), int(N2)
        else:
            N = int(args.Num)

    head = args.head


    if args.f5f:
        # file list of fast5 files.
        # fast5_name\tquality_score
        # not using the second column atm
        if args.f5f.endswith('.gz'):
            f_read = dicSwitch('gz')
        else:
            f_read = dicSwitch('norm')
        with f_read(args.f5f, 'rb') as sz:
            if args.f5f.endswith('.gz'):
                sz = io.BufferedReader(sz)
            for l in sz:
                if head:
                    head = False
                    continue
                l = l.strip('\n')
                l = l.split('\t')[0]
                path = l
                l = l.split('/')
                fast5 = l[-1]
                sig = process_fast5(path)
                if not sig:
                    continue
                if N:
                    sig = sig[:N]
                elif N1 or N2:
                    sig = sig[N1:N2]
                sig = scale_outliers(sig, args)
                # output sections
                view_sig(args, sig, fast5)

    elif args.f5_path:
        # process fast5 files given top level path
        for dirpath, dirnames, files in os.walk(args.f5_path):
            for fast5 in files:
                if fast5.endswith('.fast5'):
                    fast5_file = os.path.join(dirpath, fast5)

                    # extract data from file
                    sig = process_fast5(fast5_file)
                    if not sig:
                        print >> sys.stderr, "main():data not extracted. Moving to next file", fast5_file
                        continue
                    if N:
                        sig = sig[:N]
                    elif N1 or N2:
                        sig = sig[N1:N2]
                    sig = np.array(sig, dtype=int)
                    sig = scale_outliers(sig, args)
                    view_sig(args, sig, fast5)

    elif args.signal:
        # signal file, gzipped, from squigglepull
        # testing
        if args.signal.endswith('.gz'):
            f_read = dicSwitch('gz')
        else:
            f_read = dicSwitch('norm')
        with f_read(args.signal, 'rb') as sz:
            if args.signal.endswith('.gz'):
                sz = io.BufferedReader(sz)
            for l in sz:
                if head:
                    head = False
                    continue
                l = l.strip('\n')
                l = l.split('\t')
                fast5 = l[0]
                # modify the l[6:] to the column the data starts...little bit of variability here.
                sig = np.array([int(i) for i in l[4:]], dtype=int)
                if not sig.any():
                    print >> sys.stderr, "nope 1"
                    continue
                if N:
                    sig = sig[:N]
                elif N1 or N2:
                    sig = sig[N1:N2]
                sig = scale_outliers(sig, args)
                view_sig(args, sig, fast5)

    elif args.ind:
        # Do an OS detection here for windows (get from fast5_fetcher)
        fast5 = args.ind.split('/')[-1]
        # extract data from file
        sig = process_fast5(args.ind)
        if not sig:
            print >> sys.stderr, "main():data not extracted.", args.ind
            parser.print_help(sys.stderr)
            sys.exit(1)
        if N:
            sig = sig[:N]
        elif N1 or N2:
            sig = sig[N1:N2]
        sig = np.array(sig, dtype=int)
        sig = scale_outliers(sig, args)
        view_sig(args, sig, fast5)

    else:
        print >> sys.stderr, "Unknown file or path input"
        parser.print_help(sys.stderr)
        sys.exit(1)

    print >> sys.stderr, "Done"
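Example 1 calls several helpers (dicSwitch, process_fast5, scale_outliers, view_sig) that are not shown in this excerpt. Below is a minimal, hypothetical sketch of what dicSwitch presumably does, inferred only from how f_read is used above; it is not the project's actual implementation.

import gzip

def dicSwitch(kind):
    # Hypothetical reconstruction: pick an opener based on whether the
    # input is gzip-compressed ('gz') or a plain file ('norm').
    return gzip.open if kind == 'gz' else open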
Example 2
def get_readable_fileobj(name_or_obj,
                         encoding=None,
                         cache=False,
                         show_progress=True,
                         remote_timeout=None):
    """
    Given a filename, pathlib.Path object or a readable file-like object, return a context
    manager that yields a readable file-like object.

    This supports passing filenames, URLs, and readable file-like objects,
    any of which can be compressed in gzip, bzip2 or lzma (xz) if the
    appropriate compression libraries are provided by the Python installation.

    Notes
    -----

    This function is a context manager, and should be used for example
    as::

        with get_readable_fileobj('file.dat') as f:
            contents = f.read()

    Parameters
    ----------
    name_or_obj : str or file-like object
        The filename of the file to access (if given as a string), or
        the file-like object to access.

        If a file-like object, it must be opened in binary mode.

    encoding : str, optional
        When `None` (default), returns a file-like object with a
        ``read`` method that on Python 2.x returns `bytes` objects and
        on Python 3.x returns `str` (``unicode``) objects, using
        `locale.getpreferredencoding` as an encoding.  This matches
        the default behavior of the built-in `open` when no ``mode``
        argument is provided.

        When ``'binary'``, returns a file-like object whose ``read``
        method returns `bytes` objects.

        When another string, it is the name of an encoding, and the
        file-like object's ``read`` method will return `str` (``unicode``)
        objects, decoded from binary using the given encoding.

    cache : bool, optional
        Whether to cache the contents of remote URLs.

    show_progress : bool, optional
        Whether to display a progress bar if the file is downloaded
        from a remote server.  Default is `True`.

    remote_timeout : float
        Timeout for remote requests in seconds (default is the configurable
        `astropy.utils.data.Conf.remote_timeout`, which is 3s by default)

    Returns
    -------
    file : readable file-like object
    """

    # close_fds is a list of file handles created by this function
    # that need to be closed.  We don't want to always just close the
    # returned file handle, because it may simply be the file handle
    # passed in.  In that case it is not the responsibility of this
    # function to close it: doing so could result in a "double close"
    # and an "invalid file descriptor" exception.
    PATH_TYPES = (str, pathlib.Path)

    close_fds = []
    delete_fds = []

    if remote_timeout is None:
        # use configfile default
        remote_timeout = conf.remote_timeout

    # Get a file object to the content
    if isinstance(name_or_obj, PATH_TYPES):
        # name_or_obj could be a Path object if pathlib is available
        name_or_obj = str(name_or_obj)

        is_url = _is_url(name_or_obj)
        if is_url:
            name_or_obj = download_file(name_or_obj,
                                        cache=cache,
                                        show_progress=show_progress,
                                        timeout=remote_timeout)
        fileobj = io.FileIO(name_or_obj, 'r')
        if is_url and not cache:
            delete_fds.append(fileobj)
        close_fds.append(fileobj)
    else:
        fileobj = name_or_obj

    # Check if the file object supports random access, and if not,
    # then wrap it in a BytesIO buffer.  It would be nicer to use a
    # BufferedReader to avoid loading the whole file first,
    # but that is not compatible with streams or urllib2.urlopen
    # objects on Python 2.x.
    if not hasattr(fileobj, 'seek'):
        fileobj = io.BytesIO(fileobj.read())

    # Now read enough bytes to look at signature
    signature = fileobj.read(4)
    fileobj.seek(0)

    if signature[:3] == b'\x1f\x8b\x08':  # gzip
        import struct
        try:
            import gzip
            fileobj_new = gzip.GzipFile(fileobj=fileobj, mode='rb')
            fileobj_new.read(1)  # need to check that the file is really gzip
        except (IOError, EOFError):  # invalid gzip file
            fileobj.seek(0)
            fileobj_new.close()
        except struct.error:  # invalid gzip file on Python 3
            fileobj.seek(0)
            fileobj_new.close()
        else:
            fileobj_new.seek(0)
            fileobj = fileobj_new
    elif signature[:3] == b'BZh':  # bzip2
        try:
            import bz2
        except ImportError:
            for fd in close_fds:
                fd.close()
            raise ValueError(
                ".bz2 format files are not supported since the Python "
                "interpreter does not include the bz2 module")
        try:
            # bz2.BZ2File does not support file objects, only filenames, so we
            # need to write the data to a temporary file
            with NamedTemporaryFile("wb", delete=False) as tmp:
                tmp.write(fileobj.read())
                tmp.close()
                fileobj_new = bz2.BZ2File(tmp.name, mode='rb')
            fileobj_new.read(1)  # need to check that the file is really bzip2
        except IOError:  # invalid bzip2 file
            fileobj.seek(0)
            fileobj_new.close()
            # raise
        else:
            fileobj_new.seek(0)
            close_fds.append(fileobj_new)
            fileobj = fileobj_new
    elif signature[:3] == b'\xfd7z':  # xz
        try:
            import lzma
            fileobj_new = lzma.LZMAFile(fileobj, mode='rb')
            fileobj_new.read(1)  # need to check that the file is really xz
        except ImportError:
            for fd in close_fds:
                fd.close()
            raise ValueError(
                ".xz format files are not supported since the Python "
                "interpreter does not include the lzma module.")
        except (IOError, EOFError) as e:  # invalid xz file
            fileobj.seek(0)
            fileobj_new.close()
            # should we propagate this to the caller to signal bad content?
            # raise ValueError(e)
        else:
            fileobj_new.seek(0)
            fileobj = fileobj_new

    # By this point, we have a file, io.FileIO, gzip.GzipFile, bz2.BZ2File
    # or lzma.LZMAFile instance opened in binary mode (that is, read
    # returns bytes).  Now we need to, if requested, wrap it in a
    # io.TextIOWrapper so read will return unicode based on the
    # encoding parameter.

    needs_textio_wrapper = encoding != 'binary'

    if needs_textio_wrapper:
        # A bz2.BZ2File can not be wrapped by a TextIOWrapper,
        # so we decompress it to a temporary file and then
        # return a handle to that.
        try:
            import bz2
        except ImportError:
            pass
        else:
            if isinstance(fileobj, bz2.BZ2File):
                tmp = NamedTemporaryFile("wb", delete=False)
                data = fileobj.read()
                tmp.write(data)
                tmp.close()
                delete_fds.append(tmp)

                fileobj = io.FileIO(tmp.name, 'r')
                close_fds.append(fileobj)

        fileobj = io.BufferedReader(fileobj)
        fileobj = io.TextIOWrapper(fileobj, encoding=encoding)

        # Ensure that file is at the start - io.FileIO will for
        # example not always be at the start:
        # >>> import io
        # >>> f = open('test.fits', 'rb')
        # >>> f.read(4)
        # 'SIMP'
        # >>> f.seek(0)
        # >>> fileobj = io.FileIO(f.fileno())
        # >>> fileobj.tell()
        # 4096L

        fileobj.seek(0)

    try:
        yield fileobj
    finally:
        for fd in close_fds:
            fd.close()
        for fd in delete_fds:
            os.remove(fd.name)
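A minimal usage sketch for the context manager above, assuming astropy is installed; the file name 'table.csv.gz' is hypothetical. Compression is detected from the signature and unwrapped transparently, and the encoding argument decides whether read() returns text or bytes.

from astropy.utils.data import get_readable_fileobj

# Text access: the gzip layer is stripped, lines come back as str.
with get_readable_fileobj('table.csv.gz', encoding='utf-8') as f:
    header_line = f.readline()

# Binary access: read() returns bytes, still decompressed transparently.
with get_readable_fileobj('table.csv.gz', encoding='binary') as f:
    first_bytes = f.read(4)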
Example 3
    def __init__(self,
                 filename=None,
                 mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST,
                 fileobj=None,
                 mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        else:
            filename = os.fspath(filename)
        origmode = mode
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            raw = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(raw)
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            if origmode is None:
                import warnings
                warnings.warn(
                    "GzipFile was opened for writing, but this will "
                    "change in future Python releases.  "
                    "Specify the mode argument for opening it for writing.",
                    FutureWarning, 2)
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel, zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL, 0)
            self._write_mtime = mtime
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header(compresslevel)
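A short round-trip sketch of the constructor above, using only the standard library: in read mode the decompressed data is served through the io.BufferedReader built around _GzipReader.

import gzip
import io

buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode="wb") as gz:   # WRITE path
    gz.write(b"hello gzip\n")

buf.seek(0)
with gzip.GzipFile(fileobj=buf, mode="rb") as gz:   # READ path, buffered
    assert gz.read() == b"hello gzip\n"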
Example 4
    def openbin(
            self,
            path,  # type: Text
            mode="r",  # type: Text
            buffering=-1,  # type: int
            **options  # type: Any
    ):
        # type: (...) -> BinaryIO
        """Open a binary file-like object.

        Arguments:
            path (str): A path on the filesystem.
            mode (str): Mode to open file (must be a valid non-text mode,
                defaults to *r*). Since this method only opens binary files,
                the ``b`` in the mode string is implied.
            buffering (int): Buffering policy (-1 to use default buffering,
                0 to disable buffering, or any positive integer to indicate
                a buffer size).
            **options: keyword arguments for any additional information
                required by the filesystem (if any).

        Returns:
            io.IOBase: a *file-like* object.

        Raises:
            fs.errors.FileExpected: If the path is not a file.
            fs.errors.FileExists: If the file exists, and *exclusive mode*
                is specified (``x`` in the mode).
            fs.errors.ResourceNotFound: If the path does not exist.

        """
        _mode = Mode(mode)
        _mode.validate_bin()
        _path = self.validatepath(path)
        dir_path, file_name = split(_path)

        if not file_name:
            raise errors.FileExpected(path)

        with self._lock:
            _dir_res = self._getresource(dir_path)
            if not _dir_res or not _dir_res.is_collection:
                raise errors.ResourceNotFound(path)

            if _mode.create:
                if file_name in _dir_res.get_member_names():
                    if _mode.exclusive:
                        raise errors.FileExists(path)

                    _res = self._getresource(path)
                    if not _res or _res.is_collection:
                        raise errors.FileExpected(path)

                    stream = io.BufferedWriter(_res.begin_write())
                    io_object = RawWrapper(stream, mode=mode, name=path)
                    return io_object

                _res = _dir_res.create_empty_resource(file_name)
                stream = io.BufferedWriter(_res.begin_write())
                io_object = RawWrapper(stream, mode=mode, name=path)
                return io_object

            if file_name not in _dir_res.get_member_names():
                raise errors.ResourceNotFound(path)

            _res = self._getresource(path)
            if not _res or _res.is_collection:
                raise errors.FileExpected(path)

            if _mode.appending:
                # stream.seek(0, 2)  # io.SEEK_END
                raise NotImplementedError("Appending is not supported")

            if _mode.updating:
                raise NotImplementedError("Updating is not supported")

            if _mode.reading:
                stream = io.BufferedReader(_res.get_content())
                io_object = RawWrapper(stream, mode=mode, name=path)
                return io_object

            stream = io.BufferedWriter(_res.begin_write())
            io_object = RawWrapper(stream, mode=mode, name=path)
            return io_object
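A usage sketch of the openbin() API documented above. It uses pyfilesystem2's MemoryFS purely as an illustration (an assumption; the implementation above appears to target a different backend), since all FS subclasses share this method signature.

from fs.memoryfs import MemoryFS

with MemoryFS() as mem:
    with mem.openbin("hello.bin", "w") as f:   # 'b' is implied
        f.write(b"hello world")
    with mem.openbin("hello.bin", "r") as f:
        assert f.read(5) == b"hello"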
Example 5
def main():
    # parse command line options
    parser = OptionParser()
    parser.add_option("--inFile",
                      "-i",
                      dest="input_file",
                      default=None,
                      help="Full path of input fastq file.")
    parser.add_option("--outFile",
                      "-o",
                      dest="out_file",
                      default=None,
                      help="Full path of output file.")
    parser.add_option("--numSample",
                      "-n",
                      dest="num_sample",
                      default=None,
                      help="Number of sampled reads.")
    parser.add_option("--ungzip",
                      dest="num_sample",
                      default=None,
                      help="Number of sampled reads.")
    parser.add_option("--plainOut",
                      action="store_true",
                      dest="plain_out",
                      default=False,
                      help="Save plain text file for output, not gzip.")

    (options, args) = parser.parse_args()
    if len(sys.argv[1:]) == 0:
        print("Welcome to Fastq-Sample!\n")
        print("use -h or --help for help on argument.")
        sys.exit(1)

    if options.input_file is None:
        print("Error: need input fastq file.")
        sys.exit(1)
    else:
        input_file = options.input_file
        file_name = ".".join(os.path.basename(input_file).split(".")[:-1])

    if options.num_sample is None:
        print("Error: need -n numSample for number of sampled reads.")
        sys.exit(1)
    else:
        num_sample = int(options.num_sample)

    if options.out_file is None:
        out_file = os.path.dirname(
            input_file) + "/random%d.fq.gz" % (num_sample)
    else:
        out_file = options.out_file

    START_TIME = time.time()

    # counting total reads
    total_reads = 0
    try:
        infile = gzip.open(input_file, 'rb')
        with io.BufferedReader(infile) as f:
            for line in f:
                total_reads += 1
        ftype = "gzip"
    except (IOError, OSError):
        # not a gzip file: fall back to plain text and restart the count
        total_reads = 0
        infile = open(input_file, 'rb')
        for line in infile:
            total_reads += 1
        ftype = "plain"
    infile.close()
    total_reads = total_reads // 4
    sys.stdout.write('\r[Fastq-Sample] Sample %d out of %d reads.' %
                     (num_sample, total_reads))
    sys.stdout.flush()

    # generate random reads index
    idx_out = np.random.permutation(total_reads)[:num_sample]
    idx_out = np.sort(idx_out)

    ## print idx_out, len(idx_out)
    ## idx_out = np.arange(100)

    # output sampled reads
    if ftype == "gzip":
        infile = io.BufferedReader(gzip.open(input_file, 'rb'))
    else:
        infile = open(input_file, 'rb')

    if options.plain_out is True:
        outfile = open(out_file, "wb")
    else:
        outfile = gzip.open(out_file, "wb")

    outCNT, lineCNT = 0, -1
    for line in infile:
        lineCNT += 1
        if int(lineCNT / 4) == idx_out[outCNT]:
            outfile.write(line)
            if lineCNT % 4 == 3: outCNT += 1
        if outCNT >= len(idx_out):
            break
    infile.close()
    outfile.close()

    run_time = time.time() - START_TIME
    sys.stdout.write(
        ('\r[Fastq-Sample] Sample %d out of %d reads. '
         'Done in %.2f sec.' % (num_sample, total_reads, run_time)))
    sys.stdout.flush()
    print("")
Example 6
    def __init__(self, filename, mode="r", *, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # writelines() does not deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("x", "xb"):
            mode = "xb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode, ))

        if isinstance(filename, (str, bytes, os.PathLike)):
            self._fp = _builtin_open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError(
                "filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                                                BZ2Decompressor,
                                                trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            self._pos = 0
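A round-trip sketch of the read path above: in 'rb' mode the decompressed stream is served through the io.BufferedReader built over DecompressReader, which also makes it seekable.

import bz2
import io

payload = bz2.compress(b"hello bzip2\n")
with bz2.BZ2File(io.BytesIO(payload), mode="rb") as f:
    assert f.read() == b"hello bzip2\n"
    assert f.seekable()   # buffering provides seek support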
Example 7
    sample = dataModelMfccPhon()
    sample.loadFromCSV(reader)
    if not sample.isBegin():
        dataSet.append(sample)

print "Got " + str(len(dataSet)) + " samples"

mfccCol = MfccCollection()
mfccCol.loadFromMfccDict(sys.argv[3])

zf = zipfile.ZipFile(sys.argv[1])

model = model_from_json(zf.read("config.json"))
model.compile(loss='categorical_crossentropy', optimizer='adam')

model.set_weights(np.load(io.BufferedReader(zf.open("weight.npy", mode='r'))))

header_list = list()
sample_array = []

for elt in dataSet:
    sample_array.append(elt.getContextArray(mfccCol))

prediction = model.predict(np.asarray(sample_array))

output = csvWriter(sys.argv[4])
index_prediction = 0

for elt in dataSet:
    output.addLine((elt.infoRef[0], elt.good, prediction[index_prediction][0]))
    index_prediction += 1
Example 8
    def clean(self, data):
        data.reverse()
        self.data = self.eat_nulls(data)
        self.stream = io.BufferedReader(io.BytesIO(self.data))
Example 9
def smart_open(filename):
    if filename.endswith('.gz'):
        return io.BufferedReader(gzip.open(filename))
    return open(filename)
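Note that smart_open() above returns bytes for .gz input but text for plain files on Python 3. Below is a hedged variant that yields text in both cases; the name smart_open_text is made up for illustration.

import gzip
import io

def smart_open_text(filename, encoding="utf-8"):
    # Return text lines for both compressed and plain files, so callers
    # never have to branch on bytes vs. str.
    if filename.endswith(".gz"):
        return io.TextIOWrapper(
            io.BufferedReader(gzip.open(filename, "rb")), encoding=encoding)
    return open(filename, encoding=encoding)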
Example 10
import sys
import gzip
import io

print("chrBase" + "\t" + "chr" + "\t" + "base" + "\t" + "strand" + "\t" +
      "coverage" + "\t" + "freqC" + "\t" + "freqT")
CGmap = gzip.open(sys.argv[1], 'rb')
f = io.BufferedReader(CGmap)
for call in f:
    (chr, strand, pos, type, dinucleotide,
     perc_C, mC, cov) = call.decode().strip().split()
    if dinucleotide == "CG":
        outlist = []
        if strand == "C":
            strand = "F"
        else:
            strand = "R"
        perc_C = float(perc_C) * 100
        perc_T = 100 - (perc_C)
        outlist.append(str("chr" + chr + "." + pos))
        outlist.append(str("chr" + chr))
        outlist.append(str(pos))
        outlist.append(strand)
        outlist.append(str(cov))
        outlist.append(str(perc_C))
        outlist.append(str(perc_T))
        print("\t".join(outlist))
Example 11
def paired2single(fq1, fq2, barcodes, mismatch, fq, others, tso, polya,
                  min_len):
    buffer_max = 100000000
    mismatch = int(mismatch)  # in case of str
    barcodes_mis_dict = mismatch_dict(barcodes, mismatch)
    tso_n = len(tso)
    out1 = io.BufferedWriter(gzip.open(fq, 'w'), buffer_size=buffer_max)
    out2 = io.BufferedWriter(gzip.open(others, 'w'), buffer_size=buffer_max)
    bbcount = dict(
        list(
            zip(barcodes, [[[0 for i in range(mismatch + 1)] for j in range(2)]
                           for k in range(len(barcodes))])))
    #bbcount['ambiguous'] = [0 for i in range(2)]
    bbcount['unmatched'] = [0 for j in range(2)]

    if len(fq1) == len(fq2):
        for i in range(0, len(fq1)):
            in1 = io.BufferedReader(gzip.open(fq1[i], 'rU'),
                                    buffer_size=buffer_max)
            in2 = io.BufferedReader(gzip.open(fq2[i], 'rU'),
                                    buffer_size=buffer_max)
            r1 = FastqGeneralIterator(in1)
            r2 = FastqGeneralIterator(in2)
            buffer_i = 0
            for r in r2:
                rr = next(r1)

                tag = r[1][0:8]
                isbar_flag = False
                if tag in barcodes_mis_dict:
                    bb = barcodes_mis_dict[tag]
                    bbcount[bb[1]][0][bb[0]] += 1
                    isbar_flag = True
                else:
                    bbcount['unmatched'][0] += 1

                rr0 = rr[0]
                rr1 = rr[1]
                rr2 = rr[2]
                # trim tso and polya , and skip read less than 50nt
                ind_tso = 0
                ind_polya = len(rr1)
                ind_flag = False
                if tso in rr1:  # using `in` first to save time in case that most reads did not include tso or polya
                    ind_tso = rr1.rfind(
                        tso) + tso_n + 3  # sometimes GGG is at the end of tso
                    ind_flag = True
                if polya in rr1:
                    ind_polya = rr1.find(polya)
                    ind_flag = True
                if ind_flag:
                    if ((ind_polya - ind_tso) >= min_len):
                        rr1 = rr1[ind_tso:ind_polya]
                        rr2 = rr2[ind_tso:ind_polya]
                    else:
                        continue
                if isbar_flag:
                    rr0 = bb[1] + r[1][8:16] + '_' + rr0
                    out1.write('@%s\n%s\n+\n%s\n' % (rr0, rr1, rr2))
                    bbcount[bb[1]][1][bb[0]] += 1
                else:
                    rr0 = r[1][0:16] + '_' + rr0
                    out2.write('@%s\n%s\n+\n%s\n' % (rr0, rr1, rr2))
                    bbcount['unmatched'][1] += 1
            in1.close()
            in2.close()
    else:
        return None
    out1.flush()
    out2.flush()
    out1.close()
    out2.close()
    return bbcount
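mismatch_dict() is used above but not defined in this excerpt. A plausible sketch follows, assuming it maps every sequence within `mismatch` substitutions of a barcode to a (n_mismatches, barcode) tuple, as the bb[0]/bb[1] indexing suggests; this is a hypothetical reconstruction.

from itertools import combinations, product

def mismatch_dict(barcodes, mismatch):
    # Hypothetical helper: exact barcodes map to (0, barcode); each variant
    # with n substitutions maps to (n, barcode). First match wins, so a
    # variant already claimed by another barcode is left untouched.
    table = {}
    for bc in barcodes:
        table.setdefault(bc, (0, bc))
        for n in range(1, mismatch + 1):
            for positions in combinations(range(len(bc)), n):
                for repl in product("ACGTN", repeat=n):
                    if any(b == bc[p] for p, b in zip(positions, repl)):
                        continue  # not a real substitution at that position
                    variant = list(bc)
                    for p, b in zip(positions, repl):
                        variant[p] = b
                    table.setdefault("".join(variant), (n, bc))
    return table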
Example 12
    def __init__(self, **kwargs):
        other = kwargs.get("other", None)
        if other:
            self.fromOther(other)

        else:
            self.bufferPool = kwargs.get("bufferPool", None)
            if self.bufferPool is None:
                raise ValueError(
                    "No buffer pool found when initializing a storage file")

            fileId = kwargs.get("fileId", None)
            filePath = kwargs.get("filePath", None)
            mode = kwargs.get("mode", None)
            existing = os.path.exists(filePath)

            if fileId and filePath:
                initHeader = False
                initFreePages = False

                if not existing and mode.lower() == "create":
                    ioMode = "w+b"
                    pageSize = kwargs.get("pageSize", io.DEFAULT_BUFFER_SIZE)
                    pageClass = kwargs.get("pageClass",
                                           StorageFile.defaultPageClass)
                    schema = kwargs.get("schema", None)
                    if pageSize and pageClass and schema:
                        self.header = FileHeader(pageSize=pageSize,
                                                 pageClass=pageClass,
                                                 schema=schema)
                        initHeader = True
                        initFreePages = False
                    else:
                        raise ValueError(
                            "No page size, class or schema specified when creating a new storage file"
                        )

                elif existing and mode.lower() in ["update", "truncate"]:
                    ioMode = "r+b" if mode.lower() == "update" else "w+b"
                    f = io.BufferedReader(io.FileIO(filePath))
                    self.header = FileHeader.fromFile(f)
                    pageSize = self.pageSize()
                    initFreePages = True
                    f.close()

                else:
                    raise ValueError(
                        "Incompatible storage file mode and on-disk file status"
                    )

                if self.header:
                    self.fileId = fileId
                    self.path = filePath
                    self.file = io.BufferedRandom(io.FileIO(self.path, ioMode),
                                                  buffer_size=pageSize)
                    self.binrepr = Struct("H" + str(FileId.binrepr.size) +
                                          "s" + str(len(self.path)) + "s")
                    self.freePages = set()

                    page = self.pageClass()(pageId=self.pageId(0),
                                            buffer=bytes(self.pageSize()),
                                            schema=self.schema())
                    self.pageHdrSize = page.header.headerSize()

                    if initFreePages:
                        self.initializeFreePages()

                    if initHeader:
                        self.refreshFileHeader()

                else:
                    raise ValueError(
                        "No valid header available for storage file")
            else:
                raise ValueError(
                    "No file id or path specified in storage file constructor")
Example 13
    def load_pyrnn_model(cls, path: str):
        """
        Loads a pyrnn model to VGSL.
        """
        if not PY2:
            raise KrakenInvalidModelException(
                'Loading pickle models is not supported on python 3')

        import cPickle

        def find_global(mname, cname):
            aliases = {
                'lstm.lstm': kraken.lib.lstm,
                'ocrolib.lstm': kraken.lib.lstm,
                'ocrolib.lineest': kraken.lib.lineest,
            }
            if mname in aliases:
                return getattr(aliases[mname], cname)
            return getattr(sys.modules[mname], cname)

        of = io.open
        if path.endswith(u'.gz'):
            of = gzip.open
        with io.BufferedReader(of(path, 'rb')) as fp:
            unpickler = cPickle.Unpickler(fp)
            unpickler.find_global = find_global
            try:
                net = unpickler.load()
            except Exception as e:
                raise KrakenInvalidModelException(str(e))
            if not isinstance(net, kraken.lib.lstm.SeqRecognizer):
                raise KrakenInvalidModelException('Pickle is %s instead of '
                                                  'SeqRecognizer' %
                                                  type(net).__name__)
        # extract codec
        codec = PytorchCodec({k: [v] for k, v in net.codec.char2code.items()})

        input = net.Ni
        parallel, softmax = net.lstm.nets
        fwdnet, revnet = parallel.nets
        revnet = revnet.net

        hidden = fwdnet.WGI.shape[0]

        # extract weights
        weightnames = ('WGI', 'WGF', 'WCI', 'WGO', 'WIP', 'WFP', 'WOP')

        fwd_w = []
        rev_w = []
        for w in weightnames:
            fwd_w.append(torch.Tensor(getattr(fwdnet, w)))
            rev_w.append(torch.Tensor(getattr(revnet, w)))

        t = torch.cat(fwd_w[:4])
        weight_ih_l0 = t[:, :input + 1]
        weight_hh_l0 = t[:, input + 1:]

        t = torch.cat(rev_w[:4])
        weight_ih_l0_rev = t[:, :input + 1]
        weight_hh_l0_rev = t[:, input + 1:]

        weight_lin = torch.Tensor(softmax.W2)

        # build vgsl spec and set weights
        nn = cls('[1,1,0,{} Lbxo{} O1ca{}]'.format(input, hidden,
                                                   len(net.codec.code2char)))

        nn.nn.L_0.layer.weight_ih_l0 = torch.nn.Parameter(weight_ih_l0)
        nn.nn.L_0.layer.weight_hh_l0 = torch.nn.Parameter(weight_hh_l0)
        nn.nn.L_0.layer.weight_ih_l0_reverse = torch.nn.Parameter(
            weight_ih_l0_rev)
        nn.nn.L_0.layer.weight_hh_l0_reverse = torch.nn.Parameter(
            weight_hh_l0_rev)
        nn.nn.L_0.layer.weight_ip_l0 = torch.nn.Parameter(fwd_w[4])
        nn.nn.L_0.layer.weight_fp_l0 = torch.nn.Parameter(fwd_w[5])
        nn.nn.L_0.layer.weight_op_l0 = torch.nn.Parameter(fwd_w[6])
        nn.nn.L_0.layer.weight_ip_l0_reverse = torch.nn.Parameter(rev_w[4])
        nn.nn.L_0.layer.weight_fp_l0_reverse = torch.nn.Parameter(rev_w[5])
        nn.nn.L_0.layer.weight_op_l0_reverse = torch.nn.Parameter(rev_w[6])

        nn.nn.O_1.lin.weight = torch.nn.Parameter(weight_lin)

        nn.add_codec(codec)

        return nn
Example 14
    out_string = ' '.join(curr_entry)
    for i, mem in enumerate(curr_entry_mem):
        out_string += ' ' + ' '.join([mem, curr_entry_mem_addr[i]])
    return out_string


if isa == 'x86':
    start_recording = False
    stop_recording = False
    in_file_dump_micro = in_file_base + '_dump_micro.gz'
    in_file_mem_dump = in_file_base + '_mem_dump.gz'
    # TODO: rename file for simplified trace (too verbose)
    out_filename = app_prefix + '_clean_dump_parsed_merged.txt'

    dis_list = gzip.open(in_file_dump_micro)
    dis_io = io.BufferedReader(dis_list)
    mem_list = gzip.open(in_file_mem_dump)
    mem_io = io.BufferedReader(mem_list)

    pc = None
    prev_pc = pc
    curr_entry = []
    curr_entry_mem = []
    curr_entry_mem_addr = []
    mem_file_done = False
    outfile = open(out_filename, 'w')
    mem_tick = 0
    mem_line = ''
    mem_entry = []

    for line in dis_io:
Example 15
    def buffered_flo(content):
        clean = re.sub("data:application/octet-stream;base64,", '', content)
        floBytes = io.BytesIO(base64.b64decode(clean))
        return io.BufferedReader(floBytes)
Example 16
    def parse(self):
        self.context_ = [dict()]
        with fopen(self.filename, "r") as file:
            iobuf = io.BufferedReader(file)
            for line in iobuf:
                parse_line(line, self.context_)
Example 17
    def _makefile(sock, mode):
        return io.BufferedReader(SocketIO(sock, mode))
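The helper above wraps a raw SocketIO in a BufferedReader by hand; socket.makefile('rb') from the standard library yields an equivalent buffered binary reader, as this small sketch shows (a socketpair is used only for the demo).

import socket

a, b = socket.socketpair()
b.sendall(b"ping\n")
reader = a.makefile("rb")       # buffered binary reader over the socket
assert reader.readline() == b"ping\n"
reader.close(); a.close(); b.close()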
Example 18
def textblock(filename, start, end, compression=None, encoding=system_encoding,
              linesep=os.linesep, buffersize=4096):
    """Pull out a block of text from a file given start and stop bytes.

    This gets data starting/ending from the next linesep delimiter. Each block
    consists of bytes in the range [start,end[, i.e. the stop byte is excluded.
    If `start` is 0, then `start` corresponds to the true start byte. If
    `start` is greater than 0 and does not point to the beginning of a new
    line, then `start` is incremented until it corresponds to the start byte of
    the next line. If `end` does not point to the beginning of a new line, then
    the line that begins before `end` is included in the block although its
    last byte exceeds `end`.

    Examples
    --------
    >> with open('myfile.txt', 'wb') as f:
    ..     f.write('123\n456\n789\nabc')

    In the example below, 1 and 10 don't line up with endlines.

    >> u''.join(textblock('myfile.txt', 1, 10))
    '456\n789\n'
    """
    # Make sure `linesep` is not a byte string because
    # `io.TextIOWrapper` in Python versions other than 2.7 dislike byte
    # strings for the `newline` argument.
    linesep = str(linesep)

    # Get byte representation of the line separator.
    bin_linesep = get_bin_linesep(encoding, linesep)
    bin_linesep_len = len(bin_linesep)

    if buffersize < bin_linesep_len:
        error = ('`buffersize` ({0:d}) must be at least as large as the '
                 'number of line separator bytes ({1:d}).')
        raise ValueError(error.format(buffersize, bin_linesep_len))

    chunksize = end - start

    with open(filename, 'rb', compression) as f:
        with io.BufferedReader(f) as fb:
            # If `start` does not correspond to the beginning of the file, we
            # need to move the file pointer to `start - len(bin_linesep)`,
            # search for the position of the next a line separator, and set
            # `start` to the position after that line separator.
            if start > 0:
                # `start` is decremented by `len(bin_linesep)` to detect the
                # case where the original `start` value corresponds to the
                # beginning of a line.
                start = max(0, start - bin_linesep_len)
                # Set the file pointer to `start`.
                fb.seek(start)
                # Number of bytes to shift the file pointer before reading a
                # new chunk to make sure that a multi-byte line separator, that
                # is split by the chunk reader, is still detected.
                shift = 1 - bin_linesep_len
                while True:
                    buf = fb.read(buffersize)
                    if len(buf) < bin_linesep_len:
                        return
                    try:
                        # Find the position of the next line separator and add
                        # `len(bin_linesep)` which yields the position of the
                        # first byte of the next line.
                        start += buf.index(bin_linesep)
                        start += bin_linesep_len
                    except ValueError:
                        # No line separator was found in the current chunk.
                        # Before reading the next chunk, we move the file
                        # pointer back `len(bin_linesep) - 1` bytes to make
                        # sure that a multi-byte line separator, that may have
                        # been split by the chunk reader, is still detected.
                        start += len(buf)
                        start += shift
                        fb.seek(shift, os.SEEK_CUR)
                    else:
                        # We have found the next line separator, so we need to
                        # set the file pointer to the first byte of the next
                        # line.
                        fb.seek(start)
                        break

            with io.TextIOWrapper(fb, encoding, newline=linesep) as fbw:
                # Retrieve and yield lines until the file pointer reaches
                # `end`.
                while start < end:
                    line = next(fbw)
                    # We need to encode the line again to get the byte length
                    # in order to correctly update `start`.
                    bin_line_len = len(line.encode(encoding))
                    if chunksize < bin_line_len:
                        error = ('`chunksize` ({0:d}) is less than the line '
                                 'length ({1:d}). This may cause duplicate '
                                 'processing of this line. It is advised to '
                                 'increase `chunksize`.')
                        raise IOError(error.format(chunksize, bin_line_len))

                    yield line
                    start += bin_line_len
Example 19
def main(fq1, fq2, nseq=100000, seq_length=50, skip_first_N=1, verbose=True):
    is_gz = (fq1[-3:].lower() == '.gz') & (fq2[-3:].lower() == '.gz')

    if is_gz:
        f1 = io.TextIOWrapper(io.BufferedReader(gzip.open(fq1)))
        f2 = io.TextIOWrapper(io.BufferedReader(gzip.open(fq2)))
    else:
        f1 = open(fq1, 'rb')
        f2 = open(fq2, 'rb')

    num_read = 0
    n = 0
    try:
        g = fastq_paired_reader(f1, f2)
        # get candidate sequences
        candidates = {}
        candidate_count = collections.Counter()
        candidate_by_seq = {}
        sequence_list = []
        seq_set = set()
        i = 0
        skipped = 0
        dupes = 0

        while i < nseq:
            the_block = g.next()
            the_id = the_block.pop('id')
            seq_1 = the_block['seq_1'][:seq_length]
            seq_2 = the_block['seq_2'][:seq_length]

            the_seq = seq_1[skip_first_N:] + seq_2[skip_first_N:]
            the_N_count = len(the_seq) - len(the_seq.replace('N', ''))

            if the_N_count > 0:
                skipped += 1
                continue

            candidates[the_id] = {
                'seq_1': seq_1,
                'seq_2': seq_2,
                'hash_seq': the_seq,
            }
            i += 1

            if the_seq in seq_set:
                candidate_count[candidate_by_seq[the_seq]] += 1
                dupes += 1
            else:
                seq_set.add(the_seq)
                candidate_by_seq[the_seq] = the_id
                candidate_count[the_id] += 1

            sequence_list.append((the_seq, the_id))

        if verbose:
            print "Finished recording %d candidate pairs. " \
                  "Skipped %d as they contained undetermined bases. " \
                  "Identified %d duplicates" % (
                nseq,
                skipped,
                dupes
            )
        skipped = 0

        # run through the remainder
        for i, the_block in enumerate(g):
            if verbose and i % 500000 == 0 and i != 0:
                print "%d lines read" % i
            seq_1 = the_block.pop('seq_1')[:seq_length]
            seq_2 = the_block.pop('seq_2')[:seq_length]

            the_seq = seq_1[skip_first_N:] + seq_2[skip_first_N:]

            if 'N' in the_seq:
                skipped += 1
                continue

            if the_seq in seq_set:
                candidate_count[candidate_by_seq[the_seq]] += 1

    finally:
        f1.close()
        f2.close()

    return candidate_count, candidates
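fastq_paired_reader() is not defined in this excerpt. The sketch below is a hypothetical reconstruction consistent with how g is consumed above (dicts with 'id', 'seq_1' and 'seq_2', four FASTQ lines per record, both files assumed to be in matching order).

def fastq_paired_reader(f1, f2):
    # Hypothetical reconstruction: yield one dict per read pair.
    while True:
        id_1 = f1.readline().strip()
        seq_1 = f1.readline().strip()
        f1.readline(); f1.readline()          # '+' separator and qualities
        id_2 = f2.readline().strip()
        seq_2 = f2.readline().strip()
        f2.readline(); f2.readline()
        if not id_1 or not id_2:              # end of either file
            return
        yield {'id': id_1.split()[0], 'seq_1': seq_1, 'seq_2': seq_2}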
Example 20
    def __init__(self,
                 filename=None,
                 mode="r",
                 *,
                 format=None,
                 check=-1,
                 preset=None,
                 filters=None):
        """Open an LZMA-compressed file in binary mode.

        filename can be either an actual file name (given as a str or
        bytes object), in which case the named file is opened, or it can
        be an existing file object to read from or write to.

        mode can be "r" for reading (default), "w" for (over)writing,
        "x" for creating exclusively, or "a" for appending. These can
        equivalently be given as "rb", "wb", "xb" and "ab" respectively.

        format specifies the container format to use for the file.
        If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
        default is FORMAT_XZ.

        check specifies the integrity check to use. This argument can
        only be used when opening a file for writing. For FORMAT_XZ,
        the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
        support integrity checks - for these formats, check must be
        omitted, or be CHECK_NONE.

        When opening a file for reading, the *preset* argument is not
        meaningful, and should be omitted. The *filters* argument should
        also be omitted, except when format is FORMAT_RAW (in which case
        it is required).

        When opening a file for writing, the settings used by the
        compressor can be specified either as a preset compression
        level (with the *preset* argument), or in detail as a custom
        filter chain (with the *filters* argument). For FORMAT_XZ and
        FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
        level. For FORMAT_RAW, the caller must always specify a filter
        chain; the raw compressor does not support preset compression
        levels.

        preset (if provided) should be an integer in the range 0-9,
        optionally OR-ed with the constant PRESET_EXTREME.

        filters (if provided) should be a sequence of dicts. Each dict
        should have an entry for "id" indicating ID of the filter, plus
        additional entries for options to the filter.
        """
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if mode in ("r", "rb"):
            if check != -1:
                raise ValueError("Cannot specify an integrity check "
                                 "when opening a file for reading")
            if preset is not None:
                raise ValueError("Cannot specify a preset compression "
                                 "level when opening a file for reading")
            if format is None:
                format = FORMAT_AUTO
            mode_code = _MODE_READ
        elif mode in ("w", "wb", "a", "ab", "x", "xb"):
            if format is None:
                format = FORMAT_XZ
            mode_code = _MODE_WRITE
            self._compressor = LZMACompressor(format=format,
                                              check=check,
                                              preset=preset,
                                              filters=filters)
            self._pos = 0
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if isinstance(filename, (str, bytes)):
            if "b" not in mode:
                mode += "b"
            self._fp = builtins.open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError(
                "filename must be a str or bytes object, or a file")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                                                LZMADecompressor,
                                                trailing_error=LZMAError,
                                                format=format,
                                                filters=filters)
            self._buffer = io.BufferedReader(raw)
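A read-mode sketch of the constructor above: the decompressed stream is exposed through the io.BufferedReader built over DecompressReader.

import io
import lzma

payload = lzma.compress(b"hello xz\n")
with lzma.LZMAFile(io.BytesIO(payload), mode="rb") as f:
    assert f.read() == b"hello xz\n"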
Example 21
def pipeline_test(generator, expected, command, workers=1, sources=1,
                  mode='framed', sinks=1, decoder=None, pre_processor=None,
                  batch_size=1, sink_expect=None, sink_expect_allow_more=False,
                  sink_stop_timeout=DEFAULT_SINK_STOP_TIMEOUT,
                  sink_await=None, delay=30,
                  validate_file=None, giles_mode=False,
                  host='127.0.0.1', listen_attempts=1,
                  ready_timeout=30,
                  runner_join_timeout=DEFAULT_RUNNER_JOIN_TIMEOUT,
                  resilience_dir=None,
                  spikes={},
                  persistent_data={}):
    """
    Run a pipeline test without having to instrument everything
    yourself. This only works for 1-source, 1-sink topologies.

    Parameters:
    - `generator`: either a single data generator to use in a Sender's Reader,
        or a list of tuples of (generator, source_index) for use with
        multi-source applications. In the latter case, the senders are run
        sequentially, and the index is 0-based against the input addresses.
        the values in this set should be either strings or stringable. If they
        are custom data structures, they should already be encoded as strings.
    - `expected`: the expected output set, to be compared against the received
        output. The data should be directly comparable to the decoded output.
    - `command`: the command to run each worker. Make sure to leave out the
        Wallaroo parameters: `--in`, `--out`, `--metrics`, `--data`,
        `--control`, `--external`, `--workers`, `--name`,
        `--cluster-initializer`, and `--ponynoblock`.
        These will be applied by the test setup utility.
    - `workers`: the number of workers to use in the test. Default: 1.
    - `sources`: the number of sources in the application. Default: 1.
    - `mode`: the decoding mode to use in the sink. Can be `'framed'` or
        `'newlines'`. Default: `'framed'`
    - `sinks`: the number of sinks to set up for the application. Default: 1.
    - `decoder`: an optional decoder to use for decoding the data from the
        sink. Default: None, assume data is strings.
    - `pre_processor`: an optional pre-processor to apply to the entire
        output set before comparing it against the expected data set.
        Default: None, assume output data is directly comparable.
    - `batch_size`: the batch size to use in the sender. Default: 1
    - `sink_expect`: the number of messages to expect at the sink. This allows
        directly relying on received output for timing control. Default: None
        Should be a list of length `len(sinks)`.
    - `sink_expect_allow_more`: Bool (default False): allow more messages in sink
        after `sink_expect` values have been received.
    - `sink_await`: a list of (binary) strings to await for at the sink.
        Once all of the await values have been seen at the sink, the test may
        be stopped.
    - `sink_stop_timeout`: the timeout in seconds to use when awaiting an
        expected number of messages at the sink. Raise an error if timeout
        elapses. Default: 30
        Can be a number or a list of numbers of `len(sinks)`.
    - `delay`: Wait for `delay` seconds before stopping the cluster.
      Default 30 seconds. Only used if `sink_expect` and `sink_await`
      are both `None`.
    - `validate_file`: save sink data to a file to be validated by an external
        process.
    - `giles_mode`: if True, include a 64-bit timestamp between the length
        header and the payload when saving sink data to file. This is a
        backward compatibility mode for validators that expected
        giles-receiver format.
    - `host`: the network host address to use in workers, senders, and
        receivers. Default '127.0.0.1'
    - `listen_attempts`: attempt to start an application listening on ports
        that are provided by the system. After `listen_attempts` fail, raise
        an appropriate error. For tests that experience TCP_WAIT related
        errors, this value should be set higher than 1.
        Default 1.
    - `ready_timeout`: number of seconds before an error is raised if the
        application does not report as ready. Default 30
    - `runner_join_timeout`: the timeout in seconds to use when waiting for
      the runners to exit cleanly. If the timeout is exceeded, the runners
      are killed and an error is raised.
    - `resilience_dir`: The directory where resilience files are kept. This
        path will be cleaned up before and after each run.
    - `spikes`: A dict of 3-tuples with the worker index as its key, and
        the spike parameters (probability, margin, seed) as its value.

    `expected` and the processed sink(s) data should be directly equatable.
    The test fails if they fail an equality assertion.
    If multiple sinks are used, then expected should match the flattened
    list of processed sinks' data.
    e.g. if there are 2 sinks with the data [1,1,1] and [2,2,2] respectively,
    then expected should be [1,1,1,2,2,2].
    """
    try:
        if sink_expect is not None:
            if not isinstance(sink_expect, (list, tuple)):
                sink_expect = [sink_expect for x in range(sinks)]
            elif len(sink_expect) != sinks:  # list/tuple, but wrong length
                if len(sink_expect) == 1:
                    sink_expect = sink_expect * sinks
                else: # throw error, we don't know how to handle this
                    raise ValueError("sink_expect must be either an integer "
                        "or a list of integers whose length is the same as "
                        "the number of sinks. Got {}."
                        .format(sink_expect))
        elif sink_await is not None:
            if len(sink_await) != sinks:
                sink_await = [sink_await[:] for x in range(sinks)]

        # Start cluster
        with Cluster(command=command, host=host, sources=sources,
                     workers=workers, sinks=sinks, sink_mode=mode,
                     worker_join_timeout=runner_join_timeout,
                     is_ready_timeout=ready_timeout,
                     res_dir=resilience_dir,
                     persistent_data=persistent_data) as cluster:

            # Create senders
            if generator:
                if not isinstance(generator, list):
                    generator = [(generator, 0)]
                for gen, idx in generator:
                    reader = Reader(gen)
                    sender = Sender(cluster.source_addrs[idx], reader,
                                    batch_size=batch_size)
                    cluster.add_sender(sender)

            # start each sender and await its completion before starting the next
            if cluster.senders:
                for sender in cluster.senders:
                    sender.start()
                    sender.join()
                    if sender.error is not None:
                        logging.error("Sender exited with an error")
                        raise sender.error
                logging.debug('All senders completed sending.')
            else:
                logging.debug("No external senders were given for the cluster.")
            # Use sink, metrics, or a timer to determine when to stop the
            # runners and sinks and begin validation
            if sink_expect:
                logging.debug('Waiting for {} messages at the sinks with a timeout'
                              ' of {} seconds'.format(sink_expect,
                                                      sink_stop_timeout))
                for sink, sink_expect_val in zip(cluster.sinks, sink_expect):
                    logging.debug("SinkExpect on {} for {} msgs".format(sink, sink_expect_val))
                    cluster.sink_expect(expected=sink_expect_val,
                                        timeout=sink_stop_timeout,
                                        sink=sink,
                                        allow_more=sink_expect_allow_more)
            elif sink_await:
                logging.debug('Awaiting {} values at the sinks with a timeout of '
                              '{} seconds'.format(sum(map(len, sink_await)),
                                                  sink_stop_timeout))
                for sink, sink_await_vals in zip(cluster.sinks, sink_await):
                    cluster.sink_await(values=sink_await_vals,
                                       timeout=sink_stop_timeout,
                                       sink=sink)
            else:
                logging.debug('Waiting {} seconds before shutting down '
                              'cluster.'
                              .format(delay))
                time.sleep(delay)

            # join stoppers and check for errors
            cluster.stop_cluster()

            ############
            # Validation
            ############
            if validate_file:
                validation_files = validate_file.split(',')
                for sink, fp in zip(cluster.sinks, validation_files):
                    sink.save(fp, giles_mode)
                # fall through and let the code after the try/except return our data

            else:  # compare expected to processed
                logging.debug('Begin validation phase...')
                # Decode captured output from sink
                if decoder:
                    if not isinstance(decoder, (list, tuple)):
                        decoder = [decoder for s in cluster.sinks]
                    decoded = []
                    for sink, dec in zip(cluster.sinks, decoder):
                        decoded.append([])
                        for item in sink.data:
                            decoded[-1].append(dec(item))
                else:
                    decoded = [sink.data for sink in cluster.sinks]

                if pre_processor:
                    processed = pre_processor(decoded)
                else:
                    processed = decoded

                # Validate captured output against expected output
                if isinstance(expected, basestring):
                    expected = io.BufferedReader(io.BytesIO(expected))
                if hasattr(expected, 'read') and hasattr(expected, 'tell'):
                    if isinstance(processed, list):
                        bytesio = io.BytesIO()
                        for part in processed:
                            for p in part:
                                bytesio.write(p)
                        bytesio.seek(0)
                        processed = io.BufferedReader(bytesio)
                    elif isinstance(processed, basestring):
                        processed = io.BufferedReader(io.BytesIO(processed))
                    # compare 50 bytes at a time
                    while True:
                        start_block = expected.tell()
                        proc = processed.read(50)
                        exp = expected.read(50)
                        if not proc and not exp:
                            break
                        try:
                            assert(exp == proc)
                        except AssertionError:
                            raise AssertionError("Validation failed in bytes {}:{}"
                                                 " of expected file. Expected {!r}"
                                                 " but received {!r}.".format(
                                                     start_block,
                                                     expected.tell(),
                                                     exp,
                                                     proc))
                else:
                    flattened = list(itertools.chain.from_iterable(processed))
                    if mode == 'newlines':
                        # add newlines to expected
                        expected = [e + b'\n' for e in expected]
                    try:
                        assert(expected == flattened)
                    except AssertionError:
                        raise AssertionError("Validation failed. Expected {!r} but"
                                             " received {!r}".format(expected,
                                                                     flattened))
    except:
        logging.error("Integration pipeline_test encountered an error")
        logging.error("The last 10 lines of each worker were:\n\n{}".format(
            runner_data_format(persistent_data.get('runner_data', []),
                               from_tail=FROM_TAIL)))
        raise

    # Return runner names and outputs if try block didn't have a return
    return
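
A minimal, hypothetical invocation sketch for the test harness above. The function name `pipeline_test` is taken from the error log in the except block, and the keyword arguments follow the docstring and the function body; the message generator and the worker command are placeholder assumptions, not part of the original source.

# Hypothetical usage sketch only; everything not documented above
# (the input generator and the worker command) is an assumption.
msgs = (b'%d\n' % i for i in range(1, 4))            # assumed: pre-encoded input messages
pipeline_test(
    generator=msgs,                                  # wrapped in Reader/Sender for source 0
    expected=[b'1', b'2', b'3'],                     # compared against decoded sink output
    command='machida --application-module my_app',   # placeholder worker command
    workers=2,
    sources=1,
    sinks=1,
    mode='newlines',                                 # sink records are newline-delimited
    sink_expect=3,                                   # stop once 3 messages reach the sink
    decoder=lambda item: item,                       # identity decode for the sketch
)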
Esempio n. 22
0
def clash_results(config, bot, update, args):
    send_uploading_photo_action(bot, update)

    username = update.message.from_user.username
    clash_ids = []
    results = {}

    if args:
        clash_ids = list(set(args))
    else:
        last_id = get_last_game(config, username, update.message.chat_id)["clash_id"]
        if last_id:
            clash_ids = [last_id]

    if not clash_ids:
        clash_results_usage(config, bot, update)
        return

    for clash_id in clash_ids:
        r = requests.post('https://www.codingame.com/services/ClashOfCodeRemoteService/findClashReportInfoByHandle',
                          headers={"content-type":"application/json;charset=UTF-8"},
                          data='[{}]'.format(clash_id))
        if r.status_code == 200:
            results = json.loads(r.text)
            if "success" in results and results["success"]:
                leaderboard = []
                clash_mode = results["success"]["mode"].capitalize() if "mode" in results["success"] else "Unknown"
                message = '''
                Game id: {clash_id}
                Game mode: {clash_mode}
                Status: {clash_status}
                Creation time: {clash_creation_time}

                '''.format(
                    clash_id=clash_id,
                    clash_mode=clash_mode,
                    clash_creation_time=results["success"]["creationTime"],
                    clash_status="Finished" if results["success"]["finished"] else "In progress")
                if clash_mode != "Unknown":
                    headers=["", "Username", "Language", "Score", "Time"]
                    if clash_mode == "Shortest":
                        headers.append("Characters")
                    for player in results["success"]["players"]:
                        cache = []
                        cache.insert(0, player["rank"] if "rank" in player else 0)
                        cache.insert(1, player["codingamerNickname"] if "codingamerNickname" in player else "Unknown")
                        cache.insert(2, player["languageId"] if "languageId" in player else "Unknown")
                        cache.insert(3, '{}%'.format(player["score"] if "score" in player else "0"))
                        cache.insert(4, str(datetime.timedelta(milliseconds=player["duration"] if "duration" in player else 0)).split('.', 2)[0])
                        if clash_mode == "Shortest":
                            cache.insert(5, player["criterion"] if "criterion" in player else 0)

                        leaderboard.insert(player["rank"] if "rank" in player else 0, cache)

                    message += tabulate(sorted(leaderboard),
                                        headers,
                                        tablefmt='psql')
                message += "\n"
                message = "\n".join([i.strip() for i in message.split('\n')])

                img_byte_arr = clash_results_to_byte_arr(message)

                bot.sendPhoto(chat_id=update.message.chat_id,
                              photo=io.BufferedReader(img_byte_arr),
                              caption='https://www.codingame.com/clashofcode/clash/report/{}'.format(
                                       clash_id))

    log_print("Results",
              chat_id=update.message.chat_id,
              username=username,
              clash_id=clash_id,
              level="INFO",
              command="clash_results")
Esempio n. 23
0
def gunzip(fileobj):
    is_gzipped = fileobj.read(2) == b'\037\213'
    fileobj.seek(-2, os.SEEK_CUR)
    if is_gzipped:
        fileobj = io.BufferedReader(gzip.GzipFile(fileobj=fileobj))
    return fileobj
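
A brief usage sketch for the helper above; the path is a placeholder. The passed file object must support a relative seek so the two peeked signature bytes can be pushed back.

# Sketch: transparently read a file that may or may not be gzip-compressed.
with open('data.txt.gz', 'rb') as raw:               # placeholder path
    fileobj = gunzip(raw)                            # BufferedReader over GzipFile if gzipped
    first_line = fileobj.readline()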
Esempio n. 24
0
    def merge(self, git_repo_url, hg_repo_url, branch=None):
        # Eventually we'll want to handle a full merge, but for now, we only
        # handle the case where we don't have metadata to begin with.
        # The caller should avoid calling this function otherwise.
        assert not self._has_metadata
        remote_refs = OrderedDict()
        for line in Git.iter('ls-remote', fsdecode(git_repo_url),
                             stderr=open(os.devnull, 'wb')):
            sha1, ref = line.split(None, 1)
            remote_refs[ref] = sha1
        bundle = None
        if not remote_refs and urlparse(git_repo_url).scheme in (b'http',
                                                                 b'https'):
            try:
                bundle = HTTPReader(git_repo_url)
            except URLError as e:
                logging.error(e.reason)
                return False
            BUNDLE_SIGNATURE = b'# v2 git bundle\n'
            signature = bundle.read(len(BUNDLE_SIGNATURE))
            if signature != BUNDLE_SIGNATURE:
                logging.error('Could not find cinnabar metadata')
                return False
            bundle = io.BufferedReader(bundle)
            while True:
                line = bundle.readline().rstrip()
                if not line:
                    break
                sha1, ref = line.split(b' ', 1)
                remote_refs[ref] = sha1
        if branch:
            branches = [branch]
        else:
            branches = self._try_merge_branches(hg_repo_url)

        ref = self._find_branch(branches, remote_refs)
        if ref is None:
            logging.error('Could not find cinnabar metadata')
            return False

        if bundle:
            args = ('-v',) if util.progress else ()
            proc = GitProcess('index-pack', '--stdin', '--fix-thin', *args,
                              stdin=subprocess.PIPE,
                              stdout=open(os.devnull, 'wb'))
            shutil.copyfileobj(bundle, proc.stdin)
        else:
            fetch = ['fetch', '--no-tags', '--no-recurse-submodules', '-q']
            fetch.append('--progress' if util.progress else '--no-progress')
            fetch.append(fsdecode(git_repo_url))
            cmd = fetch + [fsdecode(ref) + ':refs/cinnabar/fetch']
            proc = GitProcess(*cmd, stdout=sys.stdout)
        if proc.wait():
            logging.error('Failed to fetch cinnabar metadata.')
            return False

        # Do some basic validation on the metadata we just got.
        commit = GitCommit(remote_refs[ref])
        if b'cinnabar@git' not in commit.author:
            logging.error('Invalid cinnabar metadata.')
            return False

        flags = set(commit.body.split())
        if b'files-meta' not in flags or b'unified-manifests-v2' not in flags \
                or len(commit.parents) != len(self.METADATA_REFS):
            logging.error('Invalid cinnabar metadata.')
            return False

        # At this point, we'll just assume this is good enough.

        # Get replace refs.
        if commit.tree != EMPTY_TREE:
            errors = False
            by_sha1 = {}
            for k, v in util.iteritems(remote_refs):
                if v not in by_sha1:
                    by_sha1[v] = k
            needed = []
            for line in Git.ls_tree(commit.tree):
                mode, typ, sha1, path = line
                if sha1 in by_sha1:
                    ref = b'refs/cinnabar/replace/%s' % path
                    if bundle:
                        Git.update_ref(ref, sha1)
                    else:
                        needed.append(
                            fsdecode(b':'.join((by_sha1[sha1], ref))))
                else:
                    logging.error('Missing commit: %s', sha1)
                    errors = True
            if errors:
                return False

            if not bundle:
                cmd = fetch + needed
                proc = GitProcess(*cmd, stdout=sys.stdout)
                if proc.wait():
                    logging.error('Failed to fetch cinnabar metadata.')
                    return False

        Git.update_ref(b'refs/cinnabar/metadata', commit.sha1)
        self._metadata_sha1 = commit.sha1
        GitHgHelper.reload()
        Git.delete_ref(b'refs/cinnabar/fetch')

        # TODO: avoid the duplication of code with __init__
        metadata = self.metadata()

        if not metadata:
            # This should never happen, but just in case.
            logging.warning('Could not find cinnabar metadata')
            Git.delete_ref(b'refs/cinnabar/metadata')
            GitHgHelper.reload()
            return False

        metadata, refs = metadata
        self._has_metadata = True
        self._metadata_refs = refs if metadata else {}
        changesets_ref = self._metadata_refs.get(b'refs/cinnabar/changesets')
        self._generation = 0
        if changesets_ref:
            commit = GitCommit(changesets_ref)
            for n, head in enumerate(commit.body.splitlines()):
                hghead, branch = head.split(b' ', 1)
                self._hgheads._previous[hghead] = (branch, 1)
                self._generation = n + 1

        self._manifest_heads_orig = set(GitHgHelper.heads(b'manifests'))

        for line in Git.ls_tree(metadata.tree):
            mode, typ, sha1, path = line
            self._replace[path] = sha1

        return True
Esempio n. 25
0
def convert_and_filter_topk(args):
    """ Convert to lowercase, count word occurrences and save top-k words to a file """

    counter = Counter()
    data_lower = os.path.join(args.output_dir, "lower.txt.gz")

    print("\nConverting to lowercase and counting word occurrences ...")
    with io.TextIOWrapper(
        io.BufferedWriter(gzip.open(data_lower, "w+")), encoding="utf-8"
    ) as file_out:

        # Open the input file either from input.txt or input.txt.gz
        _, file_extension = os.path.splitext(args.input_txt)
        if file_extension == ".gz":
            file_in = io.TextIOWrapper(
                io.BufferedReader(gzip.open(args.input_txt)), encoding="utf-8"
            )
        else:
            file_in = open(args.input_txt, encoding="utf-8")

        for line in progressbar.progressbar(file_in):
            line_lower = line.lower()
            counter.update(line_lower.split())
            file_out.write(line_lower)

        file_in.close()

    # Save top-k words
    print("\nSaving top {} words ...".format(args.top_k))
    top_counter = counter.most_common(args.top_k)
    vocab_str = "\n".join(word for word, count in top_counter)
    vocab_path = "vocab-{}.txt".format(args.top_k)
    vocab_path = os.path.join(args.output_dir, vocab_path)
    with open(vocab_path, "w+") as file:
        file.write(vocab_str)

    print("\nCalculating word statistics ...")
    total_words = sum(counter.values())
    print("  Your text file has {} words in total".format(total_words))
    print("  It has {} unique words".format(len(counter)))
    top_words_sum = sum(count for word, count in top_counter)
    word_fraction = (top_words_sum / total_words) * 100
    print(
        "  Your top-{} words are {:.4f} percent of all words".format(
            args.top_k, word_fraction
        )
    )
    print('  Your most common word "{}" occurred {} times'.format(*top_counter[0]))
    last_word, last_count = top_counter[-1]
    print(
        '  The least common word in your top-k is "{}" with {} times'.format(
            last_word, last_count
        )
    )
    for i, (w, c) in enumerate(reversed(top_counter)):
        if c > last_count:
            print(
                '  The first word with {} occurrences is "{}" at place {}'.format(
                    c, w, len(top_counter) - 1 - i
                )
            )
            break

    return data_lower, vocab_str
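
A usage sketch for the function above; the attribute names mirror what the function reads from `args`, and the paths are placeholders.

# Sketch: drive the function with a plain namespace instead of argparse.
from argparse import Namespace

args = Namespace(
    input_txt="corpus.txt.gz",   # placeholder; a plain .txt also works
    output_dir="out",            # assumed to exist already
    top_k=500000,
)
data_lower, vocab_str = convert_and_filter_topk(args)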
Esempio n. 26
0
def load_from(hf: h5py.File, name: str) -> Any:
    with io.BufferedReader(DatasetIO(hf[name])) as bf:
        return torch.load(bf)
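
A hypothetical usage sketch; it assumes `DatasetIO` (not shown here) adapts an h5py dataset to a readable raw stream, and the file and dataset names are placeholders.

# Sketch: load a tensor that was serialized with torch.save into an HDF5 dataset.
with h5py.File("checkpoint.h5", "r") as hf:          # placeholder path
    state = load_from(hf, "model_state")             # placeholder dataset name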
Esempio n. 27
0
def gzip_open_encoded(file, encoding=None):
    return io.TextIOWrapper(io.BufferedReader(gzip.open(file)),
                            encoding=encoding or "utf8")
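
A short usage sketch; the path is a placeholder.

# Sketch: iterate over a gzip-compressed text file line by line.
with gzip_open_encoded('log.txt.gz') as fh:          # placeholder path
    for line in fh:
        print(line.rstrip())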
Esempio n. 28
0
import io
"""
    Um buffer que fornece acesso de nível superior a um objeto RawIOBase sequencial e legível. 
    Ele herda o BufferedIOBase. Ao ler dados desse objeto, uma quantidade maior de dados pode ser 
    solicitada no fluxo bruto subjacente e mantida em um buffer interno.  Os dados em buffer podem 
    ser retornados diretamente nas leituras subsequentes.
    
    O construtor cria um BufferedReader para o fluxo bruto legível fornecido e buffer_size. 
    Se buffer_size for omitido, DEFAULT_BUFFER_SIZE será usado.
"""

bio = io.BytesIO(b'Luiz Filipy - Brasil 1234')

br = io.BufferedReader(bio)

# peek(): returns bytes from the stream without advancing the position. At most one read on the
# underlying raw stream is made to satisfy the call. The number of bytes returned may be less or more than requested.
print(br.peek())

# read(): reads and returns up to the given number of bytes, or, if size is omitted or negative,
# reads until EOF or until the read call would block in non-blocking mode.
print(br.read(11))  # returns: b'Luiz Filipy'

# read1(): reads and returns up to size bytes with only a single call on the raw stream.
# If at least one byte is buffered, only buffered bytes are returned.
# Otherwise, a single raw stream read call is made.
print(br.read1())  # returns: b' - Brasil 1234'
Esempio n. 29
0
def _wrap_reader_for_text(fp, encoding):
    if isinstance(fp.read(0), bytes):
        fp = io.TextIOWrapper(io.BufferedReader(fp), encoding)
    return fp
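
A short sketch of how the wrapper above behaves; the path is a placeholder. A file object opened in binary mode is wrapped for text decoding, while one already opened in text mode is returned unchanged.

# Sketch: wrap a binary file object so it can be read as UTF-8 text.
with open('data.csv', 'rb') as fp:                   # placeholder path
    text_fp = _wrap_reader_for_text(fp, 'utf-8')
    header = text_fp.readline()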
Esempio n. 30
0
    def parse(self, response):
        filename = response.meta.get('filename')
        # os.makedirs('./maine/', exist_ok=True)
        # with open('./maine/' + filename.replace('/', '-'), 'wb') as f:
            # f.write(response.body)

        # yield MaineItem(bill=response.meta.get('bill'), filename=filename.replace('/', '-'), name=response.meta.get('name'), url=response.url )
        bill = '' #response.meta.get('bill')

        session = '127th'

        state = 'maine'

        bill_name = ''

        md5 = hashlib.md5(response.body).hexdigest()

        html = ''

        url = response.url

        date = ''

        chamber = 'House & Senate'

        # for i in chamber:
        #     if i in bill_name:
        #         chamber = chamber.get(i)
        #         break

        topic = '#TODO'

        # text from pdf
        bytesio = io.BytesIO(response.body)
        bfr = io.BufferedReader(bytesio)
        pdf_text = convert_pdf_to_txt(bfr) if response.url.strip()[-4:].lower() == '.pdf' else 'unsupported file'

        # recognized text (OCR) from pdf
        if len(pdf_text.strip()) <= 50:
            with wi(filename=response.url, resolution=200) as pdf:
                pdfImage = pdf.convert('jpeg')
                imageBlobs = []
                for img in pdfImage.sequence:
                    with wi(image = img) as imgPage:
                        imageBlobs.append(imgPage.make_blob('jpeg'))
                        
            recognized_text = []

            for imgBlob in imageBlobs:
                im = Image.open(io.BytesIO(imgBlob))
                text = pytesseract.image_to_string(im, lang = 'eng')
                recognized_text.append(text)

            recognized_text = '\n\n\n'.join(recognized_text)

        pdf_text = pdf_text if len(pdf_text.strip()) > 50 else recognized_text

        yield MaineItem(md5=md5, html=html,
                        session=session, bill_name=bill_name,
                        url=url, state=state,
                        date=date, chamber=chamber,
                        topic=topic, text=pdf_text)