def main(): ''' do the thing ''' parser = MyParser( description="script name - script description") group = parser.add_mutually_exclusive_group() group.add_argument("-f", "--f5f", help="File list of fast5 paths") group.add_argument("-p", "--f5_path", help="Fast5 top dir") group.add_argument("-s", "--signal", help="Extracted signal file from SquigglePull") group.add_argument("-i", "--ind", help="Individual fast5 file") parser.add_argument("--head", action="store_true", help="Header present in signal or flat file") parser.add_argument("-n", "--Num", help="Section of signal to look at - -n 2000 or -n 100,1500") parser.add_argument("--scale_hi", type=int, default=1200, help="Upper limit for signal outlier scaling") parser.add_argument("--scale_low", type=int, default=0, help="Lower limit for signal outlier scaling") # Arguments for now, but best way forward will probably be a config file parser.add_argument("--plot_colour", default='grey', help="Colour of signal plot, takes any pyplot entry: k,r,b,g,red,blue,etc...") parser.add_argument("--save", help="Save file readname_saveArg.pdf --save saveArg.pdf, use png, etc for other file types") parser.add_argument("--save_path", help="Save filepath") parser.add_argument("--no_show", action="store_true", help="Do not show plot (used for saving many)") parser.add_argument("--dpi", type=int, default=100, help="Change DPI for publication figs, eg: --dpi 300") args = parser.parse_args() # print help if no arguments given if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) matplotlib.rcParams['savefig.dpi'] = args.dpi N = 0 N1 = 0 N2 = 0 if args.Num: if ',' in args.Num: N1, N2 = args.Num.split(',') N1, N2 = int(N1), int(N2) else: N = int(args.Num) head = False if args.head: head = True if args.f5f: # file list of fast5 files. # fast5_name\tquality_score # not using the second column atm if args.f5f.endswith('.gz'): f_read = dicSwitch('gz') else: f_read = dicSwitch('norm') with f_read(args.f5f, 'rb') as sz: if args.f5f.endswith('.gz'): sz = io.BufferedReader(sz) for l in sz: if head: head = False continue l = l.strip('\n') l = l.split('\t')[0] path = l l = l.split('/') fast5 = l[-1] sig = process_fast5(path) if not sig: continue if N: sig = sig[:N] elif N1 or N2: sig = sig[N1:N2] sig = scale_outliers(sig, args) # output sections view_sig(args, sig, fast5) elif args.f5_path: # process fast5 files given top level path for dirpath, dirnames, files in os.walk(args.f5_path): for fast5 in files: if fast5.endswith('.fast5'): fast5_file = os.path.join(dirpath, fast5) # extract data from file sig = process_fast5(fast5_file) if not sig: print >> sys.stderr, "main():data not extracted. Moving to next file", fast5_file continue if N: sig = sig[:N] elif N1 or N2: sig = sig[N1:N2] sig = np.array(sig, dtype=int) sig = scale_outliers(sig, args) view_sig(args, sig, fast5) elif args.signal: # signal file, gzipped, from squigglepull # testing if args.signal.endswith('.gz'): f_read = dicSwitch('gz') else: f_read = dicSwitch('norm') with f_read(args.signal, 'rb') as sz: if args.signal.endswith('.gz'): sz = io.BufferedReader(sz) for l in sz: if head: head = False continue l = l.strip('\n') l = l.split('\t') fast5 = l[0] # modify the l[6:] to the column the data starts...little bit of variability here. 
sig = np.array([int(i) for i in l[4:]], dtype=int) if not sig.any(): print >> sys.stderr, "nope 1" continue if N: sig = sig[:N] elif N1 or N2: sig = sig[N1:N2] sig = scale_outliers(sig, args) view_sig(args, sig, fast5) elif args.ind: # Do an OS detection here for windows (get from fast5_fetcher) fast5 = args.ind.split('/')[-1] # extract data from file sig = process_fast5(args.ind) if not sig: print >> sys.stderr, "main():data not extracted.", args.ind parser.print_help(sys.stderr) sys.exit(1) if N: sig = sig[:N] elif N1 or N2: sig = sig[N1:N2] sig = np.array(sig, dtype=int) sig = scale_outliers(sig, args) view_sig(args, sig, fast5) else: print >> sys.stderr, "Unknown file or path input" parser.print_help(sys.stderr) sys.exit(1) print >> sys.stderr, "Done"
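# The viewer script above calls a dicSwitch() helper that is not included in this
# excerpt. A minimal sketch consistent with how it is used (it must return an opener
# callable for gzipped vs. plain files) could look like the following; treat it as an
# illustrative assumption rather than the original implementation.
import gzip


def dicSwitch(kind):
    """Return an opener callable: gzip.open for 'gz', the builtin open otherwise."""
    openers = {'gz': gzip.open, 'norm': open}
    return openers.get(kind, open)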
def get_readable_fileobj(name_or_obj, encoding=None, cache=False, show_progress=True, remote_timeout=None): """ Given a filename, pathlib.Path object or a readable file-like object, return a context manager that yields a readable file-like object. This supports passing filenames, URLs, and readable file-like objects, any of which can be compressed in gzip, bzip2 or lzma (xz) if the appropriate compression libraries are provided by the Python installation. Notes ----- This function is a context manager, and should be used for example as:: with get_readable_fileobj('file.dat') as f: contents = f.read() Parameters ---------- name_or_obj : str or file-like object The filename of the file to access (if given as a string), or the file-like object to access. If a file-like object, it must be opened in binary mode. encoding : str, optional When `None` (default), returns a file-like object with a ``read`` method that on Python 2.x returns `bytes` objects and on Python 3.x returns `str` (``unicode``) objects, using `locale.getpreferredencoding` as an encoding. This matches the default behavior of the built-in `open` when no ``mode`` argument is provided. When ``'binary'``, returns a file-like object where its ``read`` method returns `bytes` objects. When another string, it is the name of an encoding, and the file-like object's ``read`` method will return `str` (``unicode``) objects, decoded from binary using the given encoding. cache : bool, optional Whether to cache the contents of remote URLs. show_progress : bool, optional Whether to display a progress bar if the file is downloaded from a remote server. Default is `True`. remote_timeout : float Timeout for remote requests in seconds (default is the configurable `astropy.utils.data.Conf.remote_timeout`, which is 3s by default) Returns ------- file : readable file-like object """ # close_fds is a list of file handles created by this function # that need to be closed. We don't want to always just close the # returned file handle, because it may simply be the file handle # passed in. In that case it is not the responsibility of this # function to close it: doing so could result in a "double close" # and an "invalid file descriptor" exception. PATH_TYPES = (str, pathlib.Path) close_fds = [] delete_fds = [] if remote_timeout is None: # use configfile default remote_timeout = conf.remote_timeout # Get a file object to the content if isinstance(name_or_obj, PATH_TYPES): # name_or_obj could be a Path object if pathlib is available name_or_obj = str(name_or_obj) is_url = _is_url(name_or_obj) if is_url: name_or_obj = download_file(name_or_obj, cache=cache, show_progress=show_progress, timeout=remote_timeout) fileobj = io.FileIO(name_or_obj, 'r') if is_url and not cache: delete_fds.append(fileobj) close_fds.append(fileobj) else: fileobj = name_or_obj # Check if the file object supports random access, and if not, # then wrap it in a BytesIO buffer. It would be nicer to use a # BufferedReader to avoid reading loading the whole file first, # but that is not compatible with streams or urllib2.urlopen # objects on Python 2.x. 
if not hasattr(fileobj, 'seek'): fileobj = io.BytesIO(fileobj.read()) # Now read enough bytes to look at signature signature = fileobj.read(4) fileobj.seek(0) if signature[:3] == b'\x1f\x8b\x08': # gzip import struct try: import gzip fileobj_new = gzip.GzipFile(fileobj=fileobj, mode='rb') fileobj_new.read(1) # need to check that the file is really gzip except (IOError, EOFError): # invalid gzip file fileobj.seek(0) fileobj_new.close() except struct.error: # invalid gzip file on Python 3 fileobj.seek(0) fileobj_new.close() else: fileobj_new.seek(0) fileobj = fileobj_new elif signature[:3] == b'BZh': # bzip2 try: import bz2 except ImportError: for fd in close_fds: fd.close() raise ValueError( ".bz2 format files are not supported since the Python " "interpreter does not include the bz2 module") try: # bz2.BZ2File does not support file objects, only filenames, so we # need to write the data to a temporary file with NamedTemporaryFile("wb", delete=False) as tmp: tmp.write(fileobj.read()) tmp.close() fileobj_new = bz2.BZ2File(tmp.name, mode='rb') fileobj_new.read(1) # need to check that the file is really bzip2 except IOError: # invalid bzip2 file fileobj.seek(0) fileobj_new.close() # raise else: fileobj_new.seek(0) close_fds.append(fileobj_new) fileobj = fileobj_new elif signature[:3] == b'\xfd7z': # xz try: import lzma fileobj_new = lzma.LZMAFile(fileobj, mode='rb') fileobj_new.read(1) # need to check that the file is really xz except ImportError: for fd in close_fds: fd.close() raise ValueError( ".xz format files are not supported since the Python " "interpreter does not include the lzma module.") except (IOError, EOFError) as e: # invalid xz file fileobj.seek(0) fileobj_new.close() # should we propagate this to the caller to signal bad content? # raise ValueError(e) else: fileobj_new.seek(0) fileobj = fileobj_new # By this point, we have a file, io.FileIO, gzip.GzipFile, bz2.BZ2File # or lzma.LZMAFile instance opened in binary mode (that is, read # returns bytes). Now we need to, if requested, wrap it in a # io.TextIOWrapper so read will return unicode based on the # encoding parameter. needs_textio_wrapper = encoding != 'binary' if needs_textio_wrapper: # A bz2.BZ2File can not be wrapped by a TextIOWrapper, # so we decompress it to a temporary file and then # return a handle to that. try: import bz2 except ImportError: pass else: if isinstance(fileobj, bz2.BZ2File): tmp = NamedTemporaryFile("wb", delete=False) data = fileobj.read() tmp.write(data) tmp.close() delete_fds.append(tmp) fileobj = io.FileIO(tmp.name, 'r') close_fds.append(fileobj) fileobj = io.BufferedReader(fileobj) fileobj = io.TextIOWrapper(fileobj, encoding=encoding) # Ensure that file is at the start - io.FileIO will for # example not always be at the start: # >>> import io # >>> f = open('test.fits', 'rb') # >>> f.read(4) # 'SIMP' # >>> f.seek(0) # >>> fileobj = io.FileIO(f.fileno()) # >>> fileobj.tell() # 4096L fileobj.seek(0) try: yield fileobj finally: for fd in close_fds: fd.close() for fd in delete_fds: os.remove(fd.name)
def __init__(self, filename=None, mode=None, compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None): """Constructor for the GzipFile class. At least one of fileobj and filename must be given a non-trivial value. The new class instance is based on fileobj, which can be a regular file, an io.BytesIO object, or any other object which simulates a file. It defaults to None, in which case filename is opened to provide a file object. When fileobj is not None, the filename argument is only used to be included in the gzip file header, which may include the original filename of the uncompressed file. It defaults to the filename of fileobj, if discernible; otherwise, it defaults to the empty string, and in this case the original filename is not included in the header. The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or 'xb' depending on whether the file will be read or written. The default is the mode of fileobj if discernible; otherwise, the default is 'rb'. A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and 'wb', 'a' and 'ab', and 'x' and 'xb'. The compresslevel argument is an integer from 0 to 9 controlling the level of compression; 1 is fastest and produces the least compression, and 9 is slowest and produces the most compression. 0 is no compression at all. The default is 9. The mtime argument is an optional numeric timestamp to be written to the last modification time field in the stream when compressing. If omitted or None, the current time is used. """ if mode and ('t' in mode or 'U' in mode): raise ValueError("Invalid mode: {!r}".format(mode)) if mode and 'b' not in mode: mode += 'b' if fileobj is None: fileobj = self.myfileobj = builtins.open(filename, mode or 'rb') if filename is None: filename = getattr(fileobj, 'name', '') if not isinstance(filename, (str, bytes)): filename = '' else: filename = os.fspath(filename) origmode = mode if mode is None: mode = getattr(fileobj, 'mode', 'rb') if mode.startswith('r'): self.mode = READ raw = _GzipReader(fileobj) self._buffer = io.BufferedReader(raw) self.name = filename elif mode.startswith(('w', 'a', 'x')): if origmode is None: import warnings warnings.warn( "GzipFile was opened for writing, but this will " "change in future Python releases. " "Specify the mode argument for opening it for writing.", FutureWarning, 2) self.mode = WRITE self._init_write(filename) self.compress = zlib.compressobj(compresslevel, zlib.DEFLATED, -zlib.MAX_WBITS, zlib.DEF_MEM_LEVEL, 0) self._write_mtime = mtime else: raise ValueError("Invalid mode: {!r}".format(mode)) self.fileobj = fileobj if self.mode == WRITE: self._write_gzip_header(compresslevel)
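# Hedged usage sketch for the constructor above: the fileobj= path lets GzipFile
# compress into and decompress out of an in-memory BytesIO, with no filename involved.
import gzip
import io

buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode='wb') as gz:
    gz.write(b'hello gzip')
buf.seek(0)
with gzip.GzipFile(fileobj=buf, mode='rb') as gz:
    assert gz.read() == b'hello gzip'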
def openbin( self, path, # type: Text mode="r", # type: Text buffering=-1, # type: int **options # type: Any ): # type: (...) -> BinaryIO """Open a binary file-like object. Arguments: path (str): A path on the filesystem. mode (str): Mode to open file (must be a valid non-text mode, defaults to *r*). Since this method only opens binary files, the ``b`` in the mode string is implied. buffering (int): Buffering policy (-1 to use default buffering, 0 to disable buffering, or any positive integer to indicate a buffer size). **options: keyword arguments for any additional information required by the filesystem (if any). Returns: io.IOBase: a *file-like* object. Raises: fs.errors.FileExpected: If the path is not a file. fs.errors.FileExists: If the file exists, and *exclusive mode* is specified (``x`` in the mode). fs.errors.ResourceNotFound: If the path does not exist. """ _mode = Mode(mode) _mode.validate_bin() _path = self.validatepath(path) dir_path, file_name = split(_path) if not file_name: raise errors.FileExpected(path) with self._lock: _dir_res = self._getresource(dir_path) if not _dir_res or not _dir_res.is_collection: raise errors.ResourceNotFound(path) if _mode.create: if file_name in _dir_res.get_member_names(): if _mode.exclusive: raise errors.FileExists(path) _res = self._getresource(path) if not _res or _res.is_collection: raise errors.FileExpected(path) stream = io.BufferedWriter(_res.begin_write()) io_object = RawWrapper(stream, mode=mode, name=path) return io_object _res = _dir_res.create_empty_resource(file_name) stream = io.BufferedWriter(_res.begin_write()) io_object = RawWrapper(stream, mode=mode, name=path) return io_object if file_name not in _dir_res.get_member_names(): raise errors.ResourceNotFound(path) _res = self._getresource(path) if not _res or _res.is_collection: raise errors.FileExpected(path) if _mode.appending: # stream.seek(0, 2) # io.SEEK_END raise NotImplementedError("Appending is not supported") if _mode.updating: raise NotImplementedError("Updating is not supported") if _mode.reading: stream = io.BufferedReader(_res.get_content()) io_object = RawWrapper(stream, mode=mode, name=path) return io_object stream = io.BufferedWriter(_res.begin_write()) io_object = RawWrapper(stream, mode=mode, name=path) return io_object
def main(): # parse command line options parser = OptionParser() parser.add_option("--inFile", "-i", dest="input_file", default=None, help="Full path of input fastq file.") parser.add_option("--outFile", "-o", dest="out_file", default=None, help="Full path of output file.") parser.add_option("--numSample", "-n", dest="num_sample", default=None, help="Number of sampled reads.") parser.add_option("--ungzip", dest="num_sample", default=None, help="Number of sampled reads.") parser.add_option("--plainOut", action="store_true", dest="plain_out", default=False, help="Save plain text file for output, not gzip.") (options, args) = parser.parse_args() if len(sys.argv[1:]) == 0: print("Welcome to Fastq-Sample!\n") print("use -h or --help for help on argument.") sys.exit(1) if options.input_file is None: print("Error: need input fastq file.") sys.exit(1) else: input_file = options.input_file file_name = ".".join(os.path.basename(input_file).split(".")[:-1]) if options.num_sample is None: print("Error: need -n numSample for number of sampled reads.") sys.exit(1) else: num_sample = int(options.num_sample) if options.out_file is None: out_file = os.path.dirname( input_file) + "/random%d.fq.gz" % (num_sample) else: out_file = options.out_file START_TIME = time.time() # counting total reads total_reads = 0 try: infile = gzip.open(input_file, 'rb') with io.BufferedReader(infile) as f: for line in f: total_reads += 1 ftype = "gzip" except: infile = open(input_file, 'rb') for line in infile: total_reads += 1 ftype = "plain" infile.close() total_reads = total_reads / 4 sys.stdout.write('\r[Fastq-Sample] Sample %d out of %d reads.' % (num_sample, total_reads)) sys.stdout.flush() # generate random reads index idx_out = np.random.permutation(total_reads)[:num_sample] idx_out = np.sort(idx_out) ## print idx_out, len(idx_out) ## idx_out = np.arange(100) # output sampled reads if ftype == "gzip": infile = io.BufferedReader(gzip.open(input_file, 'rb')) else: infile = open(input_file, 'rb') if options.plain_out is True: outfile = open(out_file, "w") else: outfile = gzip.open(out_file, "w") outCNT, lineCNT = 0, -1 for line in infile: lineCNT += 1 if int(lineCNT / 4) == idx_out[outCNT]: outfile.writelines(line) if lineCNT % 4 == 3: outCNT += 1 if outCNT >= len(idx_out): break infile.close() outfile.close() run_time = time.time() - START_TIME sys.stdout.write( ('\r[Fastq-Sample] Sample %d out of %d reads. ' 'Done in %.2f sec.' % (num_sample, total_reads, run_time))) sys.stdout.flush() print("")
def __init__(self, filename, mode="r", *, compresslevel=9): """Open a bzip2-compressed file. If filename is a str, bytes, or PathLike object, it gives the name of the file to be opened. Otherwise, it should be a file object, which will be used to read or write the compressed data. mode can be 'r' for reading (default), 'w' for (over)writing, 'x' for creating exclusively, or 'a' for appending. These can equivalently be given as 'rb', 'wb', 'xb', and 'ab'. If mode is 'w', 'x' or 'a', compresslevel can be a number between 1 and 9 specifying the level of compression: 1 produces the least compression, and 9 (default) produces the most compression. If mode is 'r', the input file may be the concatenation of multiple compressed streams. """ # This lock must be recursive, so that BufferedIOBase's # writelines() does not deadlock. self._lock = RLock() self._fp = None self._closefp = False self._mode = _MODE_CLOSED if not (1 <= compresslevel <= 9): raise ValueError("compresslevel must be between 1 and 9") if mode in ("", "r", "rb"): mode = "rb" mode_code = _MODE_READ elif mode in ("w", "wb"): mode = "wb" mode_code = _MODE_WRITE self._compressor = BZ2Compressor(compresslevel) elif mode in ("x", "xb"): mode = "xb" mode_code = _MODE_WRITE self._compressor = BZ2Compressor(compresslevel) elif mode in ("a", "ab"): mode = "ab" mode_code = _MODE_WRITE self._compressor = BZ2Compressor(compresslevel) else: raise ValueError("Invalid mode: %r" % (mode, )) if isinstance(filename, (str, bytes, os.PathLike)): self._fp = _builtin_open(filename, mode) self._closefp = True self._mode = mode_code elif hasattr(filename, "read") or hasattr(filename, "write"): self._fp = filename self._mode = mode_code else: raise TypeError( "filename must be a str, bytes, file or PathLike object") if self._mode == _MODE_READ: raw = _compression.DecompressReader(self._fp, BZ2Decompressor, trailing_error=OSError) self._buffer = io.BufferedReader(raw) else: self._pos = 0
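# Small sketch of the read path above: in 'rb' mode BZ2File wraps its decompressor
# in io.BufferedReader, so readline() and iteration behave like a plain binary file.
# The temporary file used here exists only for the example.
import bz2
import os
import tempfile

fd, path = tempfile.mkstemp(suffix='.bz2')
os.close(fd)
with bz2.BZ2File(path, 'wb') as f:
    f.write(b'line one\nline two\n')
with bz2.BZ2File(path, 'rb') as f:
    assert f.readline() == b'line one\n'
os.remove(path)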
sample = dataModelMfccPhon()
sample.loadFromCSV(reader)
if not sample.isBegin():
    dataSet.append(sample)
print("Got " + str(len(dataSet)) + " samples")
mfccCol = MfccCollection()
mfccCol.loadFromMfccDict(sys.argv[3])
zf = zipfile.ZipFile(sys.argv[1])
model = model_from_json(zf.read("config.json"))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.set_weights(np.load(io.BufferedReader(zf.open("weight.npy", mode='r'))))
header_list = list()
sample_array = []
for elt in dataSet:
    sample_array.append(elt.getContextArray(mfccCol))
prediction = model.predict(np.asarray(sample_array))
output = csvWriter(sys.argv[4])
index_prediction = 0
for elt in dataSet:
    output.addLine((elt.infoRef[0], elt.good, prediction[index_prediction][0]))
    index_prediction += 1
def clean(self, data):
    data.reverse()
    self.data = self.eat_nulls(data)
    self.stream = io.BufferedReader(io.BytesIO(self.data))
def smart_open(filename):
    # Note: the gzip branch yields a binary (bytes) reader, while the plain branch
    # returns a text-mode handle; callers need to account for the difference.
    if filename.endswith('.gz'):
        return io.BufferedReader(gzip.open(filename))
    return open(filename)
import sys
import gzip
import io

print("chrBase" + "\t" + "chr" + "\t" + "base" + "\t" + "strand" + "\t" +
      "coverage" + "\t" + "freqC" + "\t" + "freqT")
# Open the gzipped CGmap in binary mode, buffer it, and decode lines through a
# TextIOWrapper; the loop iterates the wrapped handle rather than the raw file.
CGmap = gzip.open(sys.argv[1], 'rb')
f = io.TextIOWrapper(io.BufferedReader(CGmap))
for call in f:
    chr, strand, pos, type, dinucleotide, perc_C, mC, cov = call.strip().split()
    if dinucleotide == "CG":
        outlist = []
        if strand == "C":
            strand = "F"
        else:
            strand = "R"
        perc_C = float(perc_C) * 100
        perc_T = 100 - (perc_C)
        outlist.append(str("chr" + chr + "." + pos))
        outlist.append(str("chr" + chr))
        outlist.append(str(pos))
        outlist.append(strand)
        outlist.append(str(cov))
        outlist.append(str(perc_C))
        outlist.append(str(perc_T))
        print("\t".join(outlist))
def paired2single(fq1, fq2, barcodes, mismatch, fq, others, tso, polya, min_len): buffer_max = 100000000 mismatch = int(mismatch) # in case of str barcodes_mis_dict = mismatch_dict(barcodes, mismatch) tso_n = len(tso) out1 = io.BufferedWriter(gzip.open(fq, 'w'), buffer_size=buffer_max) out2 = io.BufferedWriter(gzip.open(others, 'w'), buffer_size=buffer_max) bbcount = dict( list( zip(barcodes, [[[0 for i in range(mismatch + 1)] for j in range(2)] for k in range(len(barcodes))]))) #bbcount['ambiguous'] = [0 for i in range(2)] bbcount['unmatched'] = [0 for j in range(2)] if len(fq1) == len(fq2): for i in range(0, len(fq1)): in1 = io.BufferedReader(gzip.open(fq1[i], 'rU'), buffer_size=buffer_max) in2 = io.BufferedReader(gzip.open(fq2[i], 'rU'), buffer_size=buffer_max) r1 = FastqGeneralIterator(in1) r2 = FastqGeneralIterator(in2) buffer_i = 0 for r in r2: rr = next(r1) tag = r[1][0:8] isbar_flag = False if tag in barcodes_mis_dict: bb = barcodes_mis_dict[tag] bbcount[bb[1]][0][bb[0]] += 1 isbar_flag = True else: bbcount['unmatched'][0] += 1 rr0 = rr[0] rr1 = rr[1] rr2 = rr[2] # trim tso and polya , and skip read less than 50nt ind_tso = 0 ind_polya = len(rr1) ind_flag = False if tso in rr1: # using `in` first to save time in case that most reads did not include tso or polya ind_tso = rr1.rfind( tso) + tso_n + 3 # sometimes GGG is at the end of tso ind_flag = True if polya in rr1: ind_polya = rr1.find(polya) ind_flag = True if ind_flag: if ((ind_polya - ind_tso) >= min_len): rr1 = rr1[ind_tso:ind_polya] rr2 = rr2[ind_tso:ind_polya] else: continue if isbar_flag: rr0 = bb[1] + r[1][8:16] + '_' + rr0 out1.write('@%s\n%s\n+\n%s\n' % (rr0, rr1, rr2)) bbcount[bb[1]][1][bb[0]] += 1 else: rr0 = r[1][0:16] + '_' + rr0 out2.write('@%s\n%s\n+\n%s\n' % (rr0, rr1, rr2)) bbcount['unmatched'][1] += 1 in1.close() in2.close() else: return None out1.flush() out2.flush() out1.close() out2.close() return bbcount
def __init__(self, **kwargs): other = kwargs.get("other", None) if other: self.fromOther(other) else: self.bufferPool = kwargs.get("bufferPool", None) if self.bufferPool is None: raise ValueError( "No buffer pool found when initializing a storage file") fileId = kwargs.get("fileId", None) filePath = kwargs.get("filePath", None) mode = kwargs.get("mode", None) existing = os.path.exists(filePath) if fileId and filePath: initHeader = False initFreePages = False if not existing and mode.lower() == "create": ioMode = "w+b" pageSize = kwargs.get("pageSize", io.DEFAULT_BUFFER_SIZE) pageClass = kwargs.get("pageClass", StorageFile.defaultPageClass) schema = kwargs.get("schema", None) if pageSize and pageClass and schema: self.header = FileHeader(pageSize=pageSize, pageClass=pageClass, schema=schema) initHeader = True initFreePages = False else: raise ValueError( "No page size, class or schema specified when creating a new storage file" ) elif existing and mode.lower() in ["update", "truncate"]: ioMode = "r+b" if mode.lower() == "update" else "w+b" f = io.BufferedReader(io.FileIO(filePath)) self.header = FileHeader.fromFile(f) pageSize = self.pageSize() initFreePages = True f.close() else: raise ValueError( "Incompatible storage file mode and on-disk file status" ) if self.header: self.fileId = fileId self.path = filePath self.file = io.BufferedRandom(io.FileIO(self.path, ioMode), buffer_size=pageSize) self.binrepr = Struct("H" + str(FileId.binrepr.size) + "s" + str(len(self.path)) + "s") self.freePages = set() page = self.pageClass()(pageId=self.pageId(0), buffer=bytes(self.pageSize()), schema=self.schema()) self.pageHdrSize = page.header.headerSize() if initFreePages: self.initializeFreePages() if initHeader: self.refreshFileHeader() else: raise ValueError( "No valid header available for storage file") else: raise ValueError( "No file id or path specified in storage file constructor")
def load_pyrnn_model(cls, path: str): """ Loads an pyrnn model to VGSL. """ if not PY2: raise KrakenInvalidModelException( 'Loading pickle models is not supported on python 3') import cPickle def find_global(mname, cname): aliases = { 'lstm.lstm': kraken.lib.lstm, 'ocrolib.lstm': kraken.lib.lstm, 'ocrolib.lineest': kraken.lib.lineest, } if mname in aliases: return getattr(aliases[mname], cname) return getattr(sys.modules[mname], cname) of = io.open if path.endswith(u'.gz'): of = gzip.open with io.BufferedReader(of(path, 'rb')) as fp: unpickler = cPickle.Unpickler(fp) unpickler.find_global = find_global try: net = unpickler.load() except Exception as e: raise KrakenInvalidModelException(str(e)) if not isinstance(net, kraken.lib.lstm.SeqRecognizer): raise KrakenInvalidModelException('Pickle is %s instead of ' 'SeqRecognizer' % type(net).__name__) # extract codec codec = PytorchCodec({k: [v] for k, v in net.codec.char2code.items()}) input = net.Ni parallel, softmax = net.lstm.nets fwdnet, revnet = parallel.nets revnet = revnet.net hidden = fwdnet.WGI.shape[0] # extract weights weightnames = ('WGI', 'WGF', 'WCI', 'WGO', 'WIP', 'WFP', 'WOP') fwd_w = [] rev_w = [] for w in weightnames: fwd_w.append(torch.Tensor(getattr(fwdnet, w))) rev_w.append(torch.Tensor(getattr(revnet, w))) t = torch.cat(fwd_w[:4]) weight_ih_l0 = t[:, :input + 1] weight_hh_l0 = t[:, input + 1:] t = torch.cat(rev_w[:4]) weight_ih_l0_rev = t[:, :input + 1] weight_hh_l0_rev = t[:, input + 1:] weight_lin = torch.Tensor(softmax.W2) # build vgsl spec and set weights nn = cls('[1,1,0,{} Lbxo{} O1ca{}]'.format(input, hidden, len(net.codec.code2char))) nn.nn.L_0.layer.weight_ih_l0 = torch.nn.Parameter(weight_ih_l0) nn.nn.L_0.layer.weight_hh_l0 = torch.nn.Parameter(weight_hh_l0) nn.nn.L_0.layer.weight_ih_l0_reverse = torch.nn.Parameter( weight_ih_l0_rev) nn.nn.L_0.layer.weight_hh_l0_reverse = torch.nn.Parameter( weight_hh_l0_rev) nn.nn.L_0.layer.weight_ip_l0 = torch.nn.Parameter(fwd_w[4]) nn.nn.L_0.layer.weight_fp_l0 = torch.nn.Parameter(fwd_w[5]) nn.nn.L_0.layer.weight_op_l0 = torch.nn.Parameter(fwd_w[6]) nn.nn.L_0.layer.weight_ip_l0_reverse = torch.nn.Parameter(rev_w[4]) nn.nn.L_0.layer.weight_fp_l0_reverse = torch.nn.Parameter(rev_w[5]) nn.nn.L_0.layer.weight_op_l0_reverse = torch.nn.Parameter(rev_w[6]) nn.nn.O_1.lin.weight = torch.nn.Parameter(weight_lin) nn.add_codec(codec) return nn
out_string = ' '.join(curr_entry) for i, mem in enumerate(curr_entry_mem): out_string += ' ' + ' '.join([mem, curr_entry_mem_addr[i]]) return out_string if isa == 'x86': start_recording = False stop_recording = False in_file_dump_micro = in_file_base + '_dump_micro.gz' in_file_mem_dump = in_file_base + '_mem_dump.gz' # TODO: rename file for simplified trace (too verbose) out_filename = app_prefix + '_clean_dump_parsed_merged.txt' dis_list = gzip.open(in_file_dump_micro) dis_io = io.BufferedReader(dis_list) mem_list = gzip.open(in_file_mem_dump) mem_io = io.BufferedReader(mem_list) pc = None prev_pc = pc curr_entry = [] curr_entry_mem = [] curr_entry_mem_addr = [] mem_file_done = False outfile = open(out_filename, 'w') mem_tick = 0 mem_line = '' mem_entry = [] for line in dis_io:
def buffered_flo(content):
    clean = re.sub("data:application/octet-stream;base64,", '', content)
    floBytes = io.BytesIO(base64.b64decode(clean))
    return io.BufferedReader(floBytes)
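# Usage sketch for buffered_flo(): the data-URI prefix is stripped (when present),
# the base64 payload is decoded, and the result is exposed as a seekable binary reader.
import base64

payload = "data:application/octet-stream;base64," + base64.b64encode(b"hello").decode()
reader = buffered_flo(payload)
assert reader.read() == b"hello"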
def parse(self):
    self.context_ = [dict()]
    with fopen(self.filename, "r") as file:
        iobuf = io.BufferedReader(file)
        for line in iobuf:
            parse_line(line, self.context_)
def _makefile(sock, mode): return io.BufferedReader(SocketIO(sock, mode))
def textblock(filename, start, end, compression=None, encoding=system_encoding, linesep=os.linesep, buffersize=4096): """Pull out a block of text from a file given start and stop bytes. This gets data starting/ending from the next linesep delimiter. Each block consists of bytes in the range [start,end[, i.e. the stop byte is excluded. If `start` is 0, then `start` corresponds to the true start byte. If `start` is greater than 0 and does not point to the beginning of a new line, then `start` is incremented until it corresponds to the start byte of the next line. If `end` does not point to the beginning of a new line, then the line that begins before `end` is included in the block although its last byte exceeds `end`. Examples -------- >> with open('myfile.txt', 'wb') as f: .. f.write('123\n456\n789\nabc') In the example below, 1 and 10 don't line up with endlines. >> u''.join(textblock('myfile.txt', 1, 10)) '456\n789\n' """ # Make sure `linesep` is not a byte string because # `io.TextIOWrapper` in Python versions other than 2.7 dislike byte # strings for the `newline` argument. linesep = str(linesep) # Get byte representation of the line separator. bin_linesep = get_bin_linesep(encoding, linesep) bin_linesep_len = len(bin_linesep) if buffersize < bin_linesep_len: error = ('`buffersize` ({0:d}) must be at least as large as the ' 'number of line separator bytes ({1:d}).') raise ValueError(error.format(buffersize, bin_linesep_len)) chunksize = end - start with open(filename, 'rb', compression) as f: with io.BufferedReader(f) as fb: # If `start` does not correspond to the beginning of the file, we # need to move the file pointer to `start - len(bin_linesep)`, # search for the position of the next a line separator, and set # `start` to the position after that line separator. if start > 0: # `start` is decremented by `len(bin_linesep)` to detect the # case where the original `start` value corresponds to the # beginning of a line. start = max(0, start - bin_linesep_len) # Set the file pointer to `start`. fb.seek(start) # Number of bytes to shift the file pointer before reading a # new chunk to make sure that a multi-byte line separator, that # is split by the chunk reader, is still detected. shift = 1 - bin_linesep_len while True: buf = f.read(buffersize) if len(buf) < bin_linesep_len: raise StopIteration try: # Find the position of the next line separator and add # `len(bin_linesep)` which yields the position of the # first byte of the next line. start += buf.index(bin_linesep) start += bin_linesep_len except ValueError: # No line separator was found in the current chunk. # Before reading the next chunk, we move the file # pointer back `len(bin_linesep) - 1` bytes to make # sure that a multi-byte line separator, that may have # been split by the chunk reader, is still detected. start += len(buf) start += shift fb.seek(shift, os.SEEK_CUR) else: # We have found the next line separator, so we need to # set the file pointer to the first byte of the next # line. fb.seek(start) break with io.TextIOWrapper(fb, encoding, newline=linesep) as fbw: # Retrieve and yield lines until the file pointer reaches # `end`. while start < end: line = next(fbw) # We need to encode the line again to get the byte length # in order to correctly update `start`. bin_line_len = len(line.encode(encoding)) if chunksize < bin_line_len: error = ('`chunksize` ({0:d}) is less than the line ' 'length ({1:d}). This may cause duplicate ' 'processing of this line. 
It is advised to ' 'increase `chunksize`.') raise IOError(error.format(chunksize, bin_line_len)) yield line start += bin_line_len
def main(fq1, fq2, nseq=100000, seq_length=50, skip_first_N=1, verbose=True): is_gz = (fq1[-3:].lower() == '.gz') & (fq2[-3:].lower() == '.gz') if is_gz: f1 = io.TextIOWrapper(io.BufferedReader(gzip.open(fq1))) f2 = io.TextIOWrapper(io.BufferedReader(gzip.open(fq2))) else: f1 = open(fq1, 'rb') f2 = open(fq2, 'rb') num_read = 0 n = 0 try: g = fastq_paired_reader(f1, f2) # get candidate sequences candidates = {} candidate_count = collections.Counter() candidate_by_seq = {} sequence_list = [] seq_set = set() i = 0 skipped = 0 dupes = 0 while i < nseq: the_block = g.next() the_id = the_block.pop('id') seq_1 = the_block['seq_1'][:seq_length] seq_2 = the_block['seq_2'][:seq_length] the_seq = seq_1[skip_first_N:] + seq_2[skip_first_N:] the_N_count = len(the_seq) - len(the_seq.replace('N', '')) if the_N_count > 0: skipped += 1 continue candidates[the_id] = { 'seq_1': seq_1, 'seq_2': seq_2, 'hash_seq': the_seq, } i += 1 if the_seq in seq_set: candidate_count[candidate_by_seq[the_seq]] += 1 dupes += 1 else: seq_set.add(the_seq) candidate_by_seq[the_seq] = the_id candidate_count[the_id] += 1 sequence_list.append((the_seq, the_id)) if verbose: print "Finished recording %d candidate pairs. " \ "Skipped %d as they contained undetermined bases. " \ "Identified %d duplicates" % ( nseq, skipped, dupes ) skipped = 0 # run through the remainder for i, the_block in enumerate(g): if verbose and i % 500000 == 0 and i != 0: print "%d lines read" % i seq_1 = the_block.pop('seq_1')[:seq_length] seq_2 = the_block.pop('seq_2')[:seq_length] the_seq = seq_1[skip_first_N:] + seq_2[skip_first_N:] if 'N' in the_seq: skipped += 1 continue if the_seq in seq_set: candidate_count[candidate_by_seq[the_seq]] += 1 finally: f1.close() f2.close() return candidate_count, candidates
def __init__(self, filename=None, mode="r", *, format=None, check=-1, preset=None, filters=None): """Open an LZMA-compressed file in binary mode. filename can be either an actual file name (given as a str or bytes object), in which case the named file is opened, or it can be an existing file object to read from or write to. mode can be "r" for reading (default), "w" for (over)writing, "x" for creating exclusively, or "a" for appending. These can equivalently be given as "rb", "wb", "xb" and "ab" respectively. format specifies the container format to use for the file. If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the default is FORMAT_XZ. check specifies the integrity check to use. This argument can only be used when opening a file for writing. For FORMAT_XZ, the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not support integrity checks - for these formats, check must be omitted, or be CHECK_NONE. When opening a file for reading, the *preset* argument is not meaningful, and should be omitted. The *filters* argument should also be omitted, except when format is FORMAT_RAW (in which case it is required). When opening a file for writing, the settings used by the compressor can be specified either as a preset compression level (with the *preset* argument), or in detail as a custom filter chain (with the *filters* argument). For FORMAT_XZ and FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset level. For FORMAT_RAW, the caller must always specify a filter chain; the raw compressor does not support preset compression levels. preset (if provided) should be an integer in the range 0-9, optionally OR-ed with the constant PRESET_EXTREME. filters (if provided) should be a sequence of dicts. Each dict should have an entry for "id" indicating ID of the filter, plus additional entries for options to the filter. """ self._fp = None self._closefp = False self._mode = _MODE_CLOSED if mode in ("r", "rb"): if check != -1: raise ValueError("Cannot specify an integrity check " "when opening a file for reading") if preset is not None: raise ValueError("Cannot specify a preset compression " "level when opening a file for reading") if format is None: format = FORMAT_AUTO mode_code = _MODE_READ elif mode in ("w", "wb", "a", "ab", "x", "xb"): if format is None: format = FORMAT_XZ mode_code = _MODE_WRITE self._compressor = LZMACompressor(format=format, check=check, preset=preset, filters=filters) self._pos = 0 else: raise ValueError("Invalid mode: {!r}".format(mode)) if isinstance(filename, (str, bytes)): if "b" not in mode: mode += "b" self._fp = builtins.open(filename, mode) self._closefp = True self._mode = mode_code elif hasattr(filename, "read") or hasattr(filename, "write"): self._fp = filename self._mode = mode_code else: raise TypeError( "filename must be a str or bytes object, or a file") if self._mode == _MODE_READ: raw = _compression.DecompressReader(self._fp, LZMADecompressor, trailing_error=LZMAError, format=format, filters=filters) self._buffer = io.BufferedReader(raw)
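# Sketch mirroring the read branch above: in 'rb' mode the decompressed stream is
# wrapped in io.BufferedReader, so peek()/readline() work on an LZMAFile opened
# from an in-memory file object.
import io
import lzma

buf = io.BytesIO()
with lzma.LZMAFile(buf, mode='wb') as f:
    f.write(b'xz payload\n')
buf.seek(0)
with lzma.LZMAFile(buf, mode='rb') as f:
    assert f.readline() == b'xz payload\n'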
def pipeline_test(generator, expected, command, workers=1, sources=1, mode='framed', sinks=1, decoder=None, pre_processor=None, batch_size=1, sink_expect=None, sink_expect_allow_more=False, sink_stop_timeout=DEFAULT_SINK_STOP_TIMEOUT, sink_await=None, delay=30, validate_file=None, giles_mode=False, host='127.0.0.1', listen_attempts=1, ready_timeout=30, runner_join_timeout=DEFAULT_RUNNER_JOIN_TIMEOUT, resilience_dir=None, spikes={}, persistent_data={}): """ Run a pipeline test without having to instrument everything yourself. This only works for 1-source, 1-sink topologies. Parameters: - `generator`: either a single data generator to use in a Sender's Reader, or a list of tuples of (generator, source_index) for use with multi-source applications. In the latter case, the senders are run sequentially, and the index is 0-based against the input addresses. the values in this set should be either strings or stringable. If they are custom data structures, they should already be encoded as strings. - `expectd`: the expect output set, to be compared against the received output. The data should be directly comparable to the decoded output. - `command`: the command to run each worker. Make sure to leave out the Wallaroo parameters: `--in`, `--out`, `--metrics`, `--data`, `--control`, `--external`, `--workers`, `--name`, `--cluster-initializer`, and `--ponynoblock`. These will be applied by the test setup utility. - `workers`: the number of workers to use in the test. Default: 1. - `sources`: the number of sources in the application. Default: 1. - `mode`: the decoding mode to use in the sink. Can be `'framed'` or `'newlines'`. Default: `'framed'` - `sinks`: the number of sinks to set up for the application. Default: 1. - `decoder`: an optional decoder to use for decoding the data from the sink. Default: None, assume data is strings. - `pre_processor`: an optional pre-processor to apply to the entire output set before comparing it against the expected data set. Default: None, assume output data is directly comparable. - `batch_size`: the batch size to use in the sender. Default: 1 - `sink_expect`: the number of messages to expect at the sink. This allows directly relying on received output for timing control. Default: None Should be a list of `len(sinks)`. - `sink_expect_allow_more`: Bool (default False): allow more messages in sink after `sink_expect` values have been received. - `sink_await`: a list of (binary) strings to await for at the sink. Once all of the await values have been seen at the sink, the test may be stopped. - `sink_stop_timeout`: the timeout in seconds to use when awaiting an expected number of messages at the sink. Raise an error if timeout elapses. Default: 30 Can be a number or a list of numbers of `len(sinks)`. - `delay`: Wait for `delay` seconds before stopping the cluster. Default 30 seconds. Only used if `sink_expect` and `sink_await` are both `None`. - `validate_file`: save sink data to a file to be validated by an external process. - `giles_mode`: if True, include a 64-bit timestamp between the length header and the payload when saving sink data to file. This is a backward compatibility mode for validators that expected giles-receiver format. - `host`: the network host address to use in workers, senders, and receivers. Default '127.0.0.1' - `listen_attempts`: attempt to start an applicatin listening on ports that are provided by the system. After `listen_attempts` fail, raise an appropriate error. 
For tests that experience TCP_WAIT related errors, this value should be set higher than 1. Default 1. - `ready_timeout`: number of seconds before an error is raised if the application does not report as ready. Default 30 - `runner_join_timeout`: the timeout in seconds to use when waiting for the runners to exit cleanly. If the timeout is exceeded, the runners are killed and an error is raised. - `resilience_dir`: The directory where resilience file are kept. This path will be cleaned up before and after each run. - `spikes`: A dict of 3-tuples with the worker index as its key, and the spike parameters (probability, margin, seed) as its value. `expected` and the processed sink(s) data should be directly equatable. The test fails if they fail an equality assertion. If multiple sinks are used, then expected should match the flattened list of procssed sinks` data. e.g. if there are 2 sinks with the data [1,1,1] and [2,2,2] respectively, then expected should be [1,1,1,2,2,2]. """ try: if sink_expect is not None: if not isinstance(sink_expect, (list, tuple)): sink_expect = [sink_expect for x in range(sinks)] elif len(sink_expect) != sinks: # list/tuple, but wrong length if len(sink_expect) == 1: sink_expect = sink_expect * sinks else: # throw error, we don't know how to handle this raise ValueError("sink_expect must be either an integer " "or a list of integers whose length is the same as " "the number of sinks. Got {}." .format(sink_expect)) elif sink_await is not None: if len(sink_await) != sinks: sink_await = [sink_await[:] for x in range(sinks)] # Start cluster with Cluster(command=command, host=host, sources=sources, workers=workers, sinks=sinks, sink_mode=mode, worker_join_timeout=runner_join_timeout, is_ready_timeout = ready_timeout, res_dir=resilience_dir, persistent_data=persistent_data) as cluster: # Create senders if generator: if not isinstance(generator, list): generator = [(generator, 0)] for gen, idx in generator: reader = Reader(gen) sender = Sender(cluster.source_addrs[idx], reader, batch_size=batch_size) cluster.add_sender(sender) # start each sender and await its completion before starting the next if cluster.senders: for sender in cluster.senders: sender.start() sender.join() try: assert(sender.error is None) except Exception as err: logging.error("Sender exited with an error") raise sender.error logging.debug('All senders completed sending.') else: logging.debug("No external senders were given for the cluster.") # Use sink, metrics, or a timer to determine when to stop the # runners and sinks and begin validation if sink_expect: logging.debug('Waiting for {} messages at the sinks with a timeout' ' of {} seconds'.format(sink_expect, sink_stop_timeout)) for sink, sink_expect_val in zip(cluster.sinks, sink_expect): logging.debug("SinkExpect on {} for {} msgs".format(sink, sink_expect_val)) cluster.sink_expect(expected=sink_expect_val, timeout=sink_stop_timeout, sink=sink, allow_more=sink_expect_allow_more) elif sink_await: logging.debug('Awaiting {} values at the sinks with a timeout of ' '{} seconds'.format(sum(map(len, sink_await)), sink_stop_timeout)) for sink, sink_await_vals in zip(cluster.sinks, sink_await): cluster.sink_await(values=sink_await_vals, timeout=sink_stop_timeout, sink=sink) else: logging.debug('Waiting {} seconds before shutting down ' 'cluster.' 
.format(delay)) time.sleep(delay) # join stoppers and check for errors cluster.stop_cluster() ############ # Validation ############ if validate_file: validation_files = validate_file.split(',') for sink, fp in zip(cluster.sinks, validation_files): sink.save(fp, giles_mode) # let the code after 'finally' return our data else: # compare expected to processed logging.debug('Begin validation phase...') # Decode captured output from sink if decoder: if not isinstance(decoder, (list, tuple)): decoder = [decoder for s in cluster.sinks] decoded = [] for sink, decoder in zip(cluster.sinks, decoder): decoded.append([]) for item in sink.data: decoded[-1].append(decoder(item)) else: decoded = [sink.data for sink in cluster.sinks] if pre_processor: processed = pre_processor(decoded) else: processed = decoded # Validate captured output against expected output if isinstance(expected, basestring): expected = io.BufferedReader(io.BytesIO(expected)) if hasattr(expected, 'read') and hasattr(expected, 'tell'): if isinstance(processed, list): bytesio = io.BytesIO() for part in processed: for p in part: bytesio.write(p) bytesio.seek(0) processed = io.BufferedReader(bytesio) elif isinstance(processed, basestring): processed = io.BufferedReader(io.BytesIO(processed)) # compare 50 bytes at a time while True: start_block = expected.tell() proc = processed.read(50) exp = expected.read(50) if not proc and not exp: break try: assert(exp == proc) except: raise AssertionError("Validation failed in bytes {}:{}" " of expected file. Expected {!r}" " but received {!r}.".format( start_block, expected.tell(), exp, proc)) else: flattened = list(itertools.chain.from_iterable(processed)) if mode == 'newlines': # add newlines to expected expected = [e + b'\n' for e in expected] try: assert(expected == flattened) except: raise AssertionError("Validation failed. Expected {!r} but" " received {!r}".format(expected, processed)) except: logging.error("Integration pipeline_test encountered an error") logging.error("The last 10 lines of each worker were:\n\n{}".format( runner_data_format(persistent_data.get('runner_data', []), from_tail=FROM_TAIL))) raise # Return runner names and outputs if try block didn't have a return return
def clash_results(config, bot, update, args): send_uploading_photo_action(bot, update) username = update.message.from_user.username clash_ids = [] results = {} if args: clash_ids = (list(set(args))) else: last_id = get_last_game(config, username, update.message.chat_id)["clash_id"] if last_id: clash_ids = [last_id] if not clash_ids: clash_results_usage(config, bot, update) return for clash_id in clash_ids: r = requests.post('https://www.codingame.com/services/ClashOfCodeRemoteService/findClashReportInfoByHandle', headers={"content-type":"application/json;charset=UTF-8"}, data='[{}]'.format(clash_id)) if r.status_code == 200: results = json.loads(r.text) if "success" in results and results["success"]: leaderboard = [] clash_mode = results["success"]["mode"].capitalize() if "mode" in results["success"] else "Unknown" message = ''' Game id: {clash_id} Game mode: {clash_mode} Status: {clash_status} Creation time: {clash_creation_time} '''.format( clash_id=clash_id, clash_mode=clash_mode, clash_creation_time=results["success"]["creationTime"], clash_status="Finished" if results["success"]["finished"] else "In progress") if clash_mode != "Unknown": headers=["", "Username", "Language", "Score", "Time"] if clash_mode == "Shortest": headers.append("Characters") for player in results["success"]["players"]: cache = [] cache.insert(0, player["rank"] if "rank" in player else 0) cache.insert(1, player["codingamerNickname"] if "codingamerNickname" in player else "Unknown") cache.insert(2, player["languageId"] if "languageId" in player else "Unknown") cache.insert(3, '{}%'.format(player["score"] if "score" in player else "0")) cache.insert(4, str(datetime.timedelta(milliseconds=player["duration"] if "duration" in player else 0)).split('.', 2)[0]) if clash_mode == "Shortest": cache.insert(5, player["criterion"] if "criterion" in player else 0) leaderboard.insert(player["rank"] if "rank" in player else 0, cache) message += tabulate(sorted(leaderboard), headers, tablefmt='psql') message += "\n" message = "\n".join([i.strip() for i in message.split('\n')]) img_byte_arr = clash_results_to_byte_arr(message) bot.sendPhoto(chat_id=update.message.chat_id, photo=io.BufferedReader(img_byte_arr), caption='https://www.codingame.com/clashofcode/clash/report/{}'.format( clash_id)) log_print("Results", chat_id=update.message.chat_id, username=username, clash_id=clash_id, level="INFO", command="clash_results")
def gunzip(fileobj):
    is_gzipped = fileobj.read(2) == b'\037\213'
    fileobj.seek(-2, os.SEEK_CUR)
    if is_gzipped:
        fileobj = io.BufferedReader(gzip.GzipFile(fileobj=fileobj))
    return fileobj
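# Usage sketch for gunzip(): it sniffs the two-byte gzip magic number, rewinds, and
# transparently wraps gzipped input, while plain input is passed through untouched.
import gzip
import io

gz_stream = io.BytesIO(gzip.compress(b'payload'))
plain_stream = io.BytesIO(b'payload')
assert gunzip(gz_stream).read() == b'payload'
assert gunzip(plain_stream).read() == b'payload'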
def merge(self, git_repo_url, hg_repo_url, branch=None): # Eventually we'll want to handle a full merge, but for now, we only # handle the case where we don't have metadata to begin with. # The caller should avoid calling this function otherwise. assert not self._has_metadata remote_refs = OrderedDict() for line in Git.iter('ls-remote', fsdecode(git_repo_url), stderr=open(os.devnull, 'wb')): sha1, ref = line.split(None, 1) remote_refs[ref] = sha1 bundle = None if not remote_refs and urlparse(git_repo_url).scheme in (b'http', b'https'): try: bundle = HTTPReader(git_repo_url) except URLError as e: logging.error(e.reason) return False BUNDLE_SIGNATURE = b'# v2 git bundle\n' signature = bundle.read(len(BUNDLE_SIGNATURE)) if signature != BUNDLE_SIGNATURE: logging.error('Could not find cinnabar metadata') return False bundle = io.BufferedReader(bundle) while True: line = bundle.readline().rstrip() if not line: break sha1, ref = line.split(b' ', 1) remote_refs[ref] = sha1 if branch: branches = [branch] else: branches = self._try_merge_branches(hg_repo_url) ref = self._find_branch(branches, remote_refs) if ref is None: logging.error('Could not find cinnabar metadata') return False if bundle: args = ('-v',) if util.progress else () proc = GitProcess('index-pack', '--stdin', '--fix-thin', *args, stdin=subprocess.PIPE, stdout=open(os.devnull, 'wb')) shutil.copyfileobj(bundle, proc.stdin) else: fetch = ['fetch', '--no-tags', '--no-recurse-submodules', '-q'] fetch.append('--progress' if util.progress else '--no-progress') fetch.append(fsdecode(git_repo_url)) cmd = fetch + [fsdecode(ref) + ':refs/cinnabar/fetch'] proc = GitProcess(*cmd, stdout=sys.stdout) if proc.wait(): logging.error('Failed to fetch cinnabar metadata.') return False # Do some basic validation on the metadata we just got. commit = GitCommit(remote_refs[ref]) if b'cinnabar@git' not in commit.author: logging.error('Invalid cinnabar metadata.') return False flags = set(commit.body.split()) if b'files-meta' not in flags or b'unified-manifests-v2' not in flags \ or len(commit.parents) != len(self.METADATA_REFS): logging.error('Invalid cinnabar metadata.') return False # At this point, we'll just assume this is good enough. # Get replace refs. if commit.tree != EMPTY_TREE: errors = False by_sha1 = {} for k, v in util.iteritems(remote_refs): if v not in by_sha1: by_sha1[v] = k needed = [] for line in Git.ls_tree(commit.tree): mode, typ, sha1, path = line if sha1 in by_sha1: ref = b'refs/cinnabar/replace/%s' % path if bundle: Git.update_ref(ref, sha1) else: needed.append( fsdecode(b':'.join((by_sha1[sha1], ref)))) else: logging.error('Missing commit: %s', sha1) errors = True if errors: return False if not bundle: cmd = fetch + needed proc = GitProcess(*cmd, stdout=sys.stdout) if proc.wait(): logging.error('Failed to fetch cinnabar metadata.') return False Git.update_ref(b'refs/cinnabar/metadata', commit.sha1) self._metadata_sha1 = commit.sha1 GitHgHelper.reload() Git.delete_ref(b'refs/cinnabar/fetch') # TODO: avoid the duplication of code with __init__ metadata = self.metadata() if not metadata: # This should never happen, but just in case. 
logging.warn('Could not find cinnabar metadata') Git.delete_ref(b'refs/cinnabar/metadata') GitHgHelper.reload() return False metadata, refs = metadata self._has_metadata = True self._metadata_refs = refs if metadata else {} changesets_ref = self._metadata_refs.get(b'refs/cinnabar/changesets') self._generation = 0 if changesets_ref: commit = GitCommit(changesets_ref) for n, head in enumerate(commit.body.splitlines()): hghead, branch = head.split(b' ', 1) self._hgheads._previous[hghead] = (branch, 1) self._generation = n + 1 self._manifest_heads_orig = set(GitHgHelper.heads(b'manifests')) for line in Git.ls_tree(metadata.tree): mode, typ, sha1, path = line self._replace[path] = sha1 return True
def convert_and_filter_topk(args): """ Convert to lowercase, count word occurrences and save top-k words to a file """ counter = Counter() data_lower = os.path.join(args.output_dir, "lower.txt.gz") print("\nConverting to lowercase and counting word occurrences ...") with io.TextIOWrapper( io.BufferedWriter(gzip.open(data_lower, "w+")), encoding="utf-8" ) as file_out: # Open the input file either from input.txt or input.txt.gz _, file_extension = os.path.splitext(args.input_txt) if file_extension == ".gz": file_in = io.TextIOWrapper( io.BufferedReader(gzip.open(args.input_txt)), encoding="utf-8" ) else: file_in = open(args.input_txt, encoding="utf-8") for line in progressbar.progressbar(file_in): line_lower = line.lower() counter.update(line_lower.split()) file_out.write(line_lower) file_in.close() # Save top-k words print("\nSaving top {} words ...".format(args.top_k)) top_counter = counter.most_common(args.top_k) vocab_str = "\n".join(word for word, count in top_counter) vocab_path = "vocab-{}.txt".format(args.top_k) vocab_path = os.path.join(args.output_dir, vocab_path) with open(vocab_path, "w+") as file: file.write(vocab_str) print("\nCalculating word statistics ...") total_words = sum(counter.values()) print(" Your text file has {} words in total".format(total_words)) print(" It has {} unique words".format(len(counter))) top_words_sum = sum(count for word, count in top_counter) word_fraction = (top_words_sum / total_words) * 100 print( " Your top-{} words are {:.4f} percent of all words".format( args.top_k, word_fraction ) ) print(' Your most common word "{}" occurred {} times'.format(*top_counter[0])) last_word, last_count = top_counter[-1] print( ' The least common word in your top-k is "{}" with {} times'.format( last_word, last_count ) ) for i, (w, c) in enumerate(reversed(top_counter)): if c > last_count: print( ' The first word with {} occurrences is "{}" at place {}'.format( c, w, len(top_counter) - 1 - i ) ) break return data_lower, vocab_str
def load_from(hf: h5py.File, name: str) -> Any:
    with io.BufferedReader(DatasetIO(hf[name])) as bf:
        return torch.load(bf)
def gzip_open_encoded(file, encoding=None):
    # Honour the caller-supplied encoding; fall back to UTF-8 when none is given.
    return io.TextIOWrapper(io.BufferedReader(gzip.open(file)), encoding=encoding or "utf8")
import io

"""
A buffer that provides higher-level access to a readable, sequential RawIOBase
object. It inherits from BufferedIOBase. When reading data from this object, a
larger amount of data may be requested from the underlying raw stream and kept
in an internal buffer. The buffered data can then be returned directly on
subsequent reads.

The constructor creates a BufferedReader for the given readable raw stream and
buffer_size. If buffer_size is omitted, DEFAULT_BUFFER_SIZE is used.
"""

bio = io.BytesIO(b'Luiz Filipy - Brasil 1234')
br = io.BufferedReader(bio)

# peek(): returns bytes from the stream without advancing the position. At most one
# read of the raw stream is made to satisfy the call. The number of bytes returned
# may be less or more than requested.
print(br.peek())

# read(): reads and returns the given number of bytes, or, if size is not given or is
# negative, reads until EOF (or until the read call would block in non-blocking mode).
print(br.read(11))  # returns: b'Luiz Filipy'

# read1(): reads and returns up to size bytes with a single call on the raw stream.
# If at least one byte is buffered, only buffered bytes are returned; otherwise a
# single raw stream read call is made.
print(br.read1())  # returns: b' - Brasil 1234'
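# Follow-up check (standard library only): peek() leaves the buffered position
# untouched, while read() advances it.
bio2 = io.BytesIO(b'0123456789')
br2 = io.BufferedReader(bio2)
br2.peek(4)
assert br2.tell() == 0
br2.read(4)
assert br2.tell() == 4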
def _wrap_reader_for_text(fp, encoding):
    if isinstance(fp.read(0), bytes):
        fp = io.TextIOWrapper(io.BufferedReader(fp), encoding)
    return fp
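# Usage sketch for _wrap_reader_for_text(): a bytes stream is upgraded to a decoded
# text stream, while a stream that already yields str would be returned unchanged.
import io

raw = io.BytesIO('héllo\n'.encode('utf-8'))
text_fp = _wrap_reader_for_text(raw, 'utf-8')
assert text_fp.readline() == 'héllo\n'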
def parse(self, response): filename = response.meta.get('filename') # os.makedirs('./maine/', exist_ok=True) # with open('./maine/' + filename.replace('/', '-'), 'wb') as f: # f.write(response.body) # yield MaineItem(bill=response.meta.get('bill'), filename=filename.replace('/', '-'), name=response.meta.get('name'), url=response.url ) bill = '' #response.meta.get('bill') session = '127th' state = 'maine' bill_name = '' md5 = hashlib.md5(response.body).hexdigest() html = '' url = response.url date = '' chamber = 'House & Senate' # for i in chamber: # if i in bill_name: # chamber = chamber.get(i) # break topic = '#TODO' # text from pdf bytesio = io.BytesIO(response.body) bfr = io.BufferedReader(bytesio) pdf_text = convert_pdf_to_txt(bfr) if response.url.strip()[-4:].lower() == '.pdf' else 'unsupported file' # recognized text (OCR) from pdf if len(pdf_text.strip()) <= 50: with wi(filename=response.url, resolution=200) as pdf: pdfImage = pdf.convert('jpeg') imageBlobs = [] for img in pdfImage.sequence: with wi(image = img) as imgPage: imageBlobs.append(imgPage.make_blob('jpeg')) recognized_text = [] for imgBlob in imageBlobs: im = Image.open(io.BytesIO(imgBlob)) text = pytesseract.image_to_string(im, lang = 'eng') recognized_text.append(text) recognized_text = '\n\n\n'.join(recognized_text) pdf_text = pdf_text if len(pdf_text.strip()) > 50 else recognized_text yield MaineItem(md5=md5, html=html, session=session, bill_name=bill_name, url=url, state=state, date=date, chamber=chamber, topic=topic, text=pdf_text)