def _initialize(self, reset=False):
    """Open the variant HDF5 store and its chromosome-lookup pickles.

    On any read failure (or when ``reset`` is True) the HDF5 file is
    rebuilt via ``self._make_variant_hdf5`` and then reopened read-only.
    """
    if not reset:
        try:
            print("Initializing VariantHDF5 ...")
            print("\tReading {} ...".format(self._variant_hdf5_file_path))
            self._variant_hdf5 = open_file(self._variant_hdf5_file_path, mode="r")
            print("\tReading {} ...".format(
                self._id_chrom_pickle_gz_file_path))
            with gzip_open(self._id_chrom_pickle_gz_file_path
                           ) as id_chrom_pickle_gz_file:
                # variant id -> chromosome lookup
                self._id_chrom = load(id_chrom_pickle_gz_file)
            print("\tReading {} ...".format(
                self._gene_chrom_pickle_gz_file_path))
            with gzip_open(self._gene_chrom_pickle_gz_file_path
                           ) as gene_chrom_pickle_gz_file:
                # gene -> chromosome lookup
                self._gene_chrom = load(gene_chrom_pickle_gz_file)
        except (OSError, FileNotFoundError, HDF5ExtError) as exception:
            # Any missing/corrupt artifact triggers a full rebuild below.
            warn("\tFailed: {}.".format(exception))
            reset = True
    if reset:
        print("Resetting ...")
        if self._variant_hdf5:
            # Close the stale handle before regenerating the file.
            self._variant_hdf5.close()
            print("\tClosed {} ...".format(self._variant_hdf5_file_path))
        print("\tMaking {} ...".format(self._variant_hdf5_file_path))
        self._make_variant_hdf5()
        print("\tReading {} ...".format(self._variant_hdf5_file_path))
        self._variant_hdf5 = open_file(self._variant_hdf5_file_path, mode="r")
def corpus_reader(
        path: str,
        from_memory: bool = False
) -> Union[Tuple[str, None], Tuple[None, str]]:
    """Function to read corpus text file.

    Args:
        path: Path to the file (or an open file-like object when
            ``from_memory`` is True).
        from_memory: To "read" from memory.

    Returns:
        Corpus text and error string in case of any.

    Raises:
        IOError: Occurred on reading/unpacking error.
    """
    try:
        if from_memory:
            # In this mode `path` is actually a readable object.
            return path.read(), None
        if path.endswith(".gz"):
            with gzip_open(path, 'rb') as f:
                return f.read(), None
        with open(path, 'r', encoding='utf-8') as f:
            return f.read(), None
    except IOError as ex:
        # Fix: return the error as a string, as documented and annotated;
        # previously the IOError object itself was returned.
        return None, str(ex)
def gunzip(source_filepath, dest_filepath=None, block_size=65536,
           remove_source=False, stdout_file=None):
    """Decompress ``source_filepath`` to ``dest_filepath``.

    When no destination is given, the '.gz' suffix is dropped from the
    source path. When the destination is a directory, the source's base
    name (minus '.gz') is used inside it. Optionally removes the source
    afterwards.
    """
    if not dest_filepath:
        # Fix: str.strip('.gz') removes *characters* from both ends
        # (e.g. 'zdataz.gz'.strip('.gz') -> 'zdata'); drop the suffix.
        if source_filepath.endswith('.gz'):
            dest_filepath = source_filepath[:-len('.gz')]
        else:
            dest_filepath = source_filepath
    if os.path.isdir(dest_filepath):
        file_name = source_filepath.split(SPLITTER)[-1].replace('.gz', '')
        dest_filepath = add_slash(dest_filepath) + file_name
    print('Gunzipping ', source_filepath, 'to', dest_filepath,
          flush=True, file=stdout_file)
    with gzip_open(source_filepath, 'rb') as s_file, \
            open(dest_filepath, 'wb') as d_file:
        while True:
            block = s_file.read(block_size)
            if not block:
                break
            # Fix: each block was previously written twice, corrupting
            # the decompressed output.
            d_file.write(block)
    if remove_source:
        os.remove(source_filepath)
def walk(self, file_name):
    """Yield the submission parsed from *file_name*.

    Gzip-compressed files (by '.gz' suffix) are opened via gzip; anything
    else is read as UTF-8 text.
    """
    if file_name.endswith('.gz'):
        handle = gzip_open(file_name)
    else:
        handle = open(file_name, encoding='utf-8')
    with handle as current_file:
        yield self._get_submit_from_xml(current_file)
def read_file(input_file_path):
    """Stream a (possibly gzipped) UniProt flat file, yielding one parsed
    record at a time.

    Records are delimited by ``UNIPROT_RECORD_TERMINATOR``; a progress line
    is logged every ``PRINT_LIMIT`` records.
    """
    infile = None
    if input_file_path.endswith('.gz'):
        infile = gzip_open(input_file_path)
    else:
        # NOTE(review): binary update mode means lines are bytes on
        # Python 3 -- presumably this code targets Python 2; confirm.
        infile = open(input_file_path, 'r+b')
    count = 0
    total_records = 0
    chunk = StringIO()
    for line in infile:
        parsed = None
        if len(UNIPROT_RECORD_TERMINATOR.findall(line)) == 0:
            # Still inside the current record: accumulate the line.
            chunk.write(line)
        else:
            count += 1
            if count >= PRINT_LIMIT:
                total_records += count
                count = 0
                logging.info("Processed " + str(total_records))
            try:
                parsed = parse_chunk(chunk)
            except Exception as e:
                # NOTE(review): parse errors are silently swallowed and the
                # chunk dropped; `e.message` has no effect here.
                e.message
                pass
            # Start accumulating the next record.
            chunk = StringIO()
        if parsed is None:
            continue
        yield parsed
    infile.close()
def fetch_and_archive(service, email, archive_path, mid_list):
    """Fetch each message id in *mid_list* via *service* and archive its
    raw MIME body as ``<hex-mid>.gz`` under *archive_path*.

    Relative archive paths are resolved against the current working
    directory (with ``~`` expansion). Failed fetches are counted and
    skipped; totals are logged at the end.
    """
    logger.info(
        'fetch_and_archive started. email: %s, archive_path: %s, mid_list: %d message(s)'
        % (email, archive_path, len(mid_list))
    )
    if path_isabs(archive_path):
        output_dir = realpath(archive_path)
    else:
        output_dir = realpath(expanduser(path_join(getcwd(), archive_path)))
    count = 0
    error = 0
    for mid in mid_list:
        file_name = path_join(output_dir, ('%x.gz' % mid))
        message = fetch_mail(service, email, mid)
        if not message:
            # Fetch failed: record and continue with the next id.
            error += 1
            continue
        with gzip_open(file_name, 'wb') as f:
            # 'raw' holds the urlsafe-base64 encoded MIME message.
            f.write(urlsafe_b64decode(message['raw']))
        logger.debug('Message id %x gzipped to %s.' % (mid, file_name))
        count += 1
    logger.info('fetch_and_archive completed. Total %d item(s) saved. Error %d item(s).' % (count, error))
def load_from_pickle_gz(file):
    """Yield successive pickled objects from a gzip-compressed file.

    Iteration stops at end of file or at the first unpicklable record.

    Args:
        file: path to the .pickle.gz file.
    """
    # Fix: the handle was never closed. A with-block guarantees cleanup;
    # closing/exhausting the generator runs the __exit__ via GeneratorExit.
    with gzip_open(file, "rb") as fh:
        while True:
            try:
                yield pickle.load(fh)
            except (EOFError, pickle.UnpicklingError):
                return
def _load_index_json(self, app_cache):
    """Read, verify, and parse the gzipped '.index.json.gz' located in the
    app cache directory; return the decoded JSON."""
    index_json_gz_filename = os.path.join(app_cache.get_cache_dir(), '.index.json.gz')
    self._verify_file(index_json_gz_filename)
    with gzip_open(index_json_gz_filename, 'rb') as fgzip:
        return loads(fgzip.read())
def read_file(input_file_path):
    """Stream a (possibly gzipped) tab-separated file, yielding one dict
    per row keyed by ``FIELDNAMES`` with 'start'/'end' coerced to int.

    Logs a progress line every ``PRINT_LIMIT`` rows.
    """
    infile = None
    if input_file_path.endswith('.gz'):
        infile = gzip_open(input_file_path)
    else:
        # NOTE(review): binary update mode; csv.reader expects text on
        # Python 3 -- presumably this code targets Python 2; confirm.
        infile = open(input_file_path, 'r+b')
    count = 0
    total_records = 0
    csv_reader = reader(infile, delimiter='\t')
    for line in csv_reader:
        count += 1
        if count >= PRINT_LIMIT:
            total_records += count
            count = 0
            logging.info("Processed " + str(total_records))
        item = {}
        for index, field in enumerate(FIELDNAMES):
            item[field] = line[index]
        # Coordinate columns are numeric.
        item['start'] = int(item['start'])
        item['end'] = int(item['end'])
        yield item
    infile.close()
def merge_contents(filelist):
    """
    Merges a list of Contents files and returns a dict of the merged
    files: package name -> list of file paths.

    Entries that are None or not existing files are skipped. The prose
    header (up to the line starting with 'FILE') is ignored.
    """
    pkgs = {}
    for path in filelist:
        if not (path and isfile(path)):
            continue
        # Fix: close the gzip handle (it previously leaked).
        with gzip_open(path) as fh:
            contents = fh.read().decode('utf-8').split('\n')
        header = False
        for line in contents:
            if line.startswith('This file maps each file'):
                header = True
            if line.startswith('FILE'):
                header = False
                continue
            if line != '' and not header:
                sin = line.split()
                # Last column is the package; the rest is the file path.
                if sin[-1] not in pkgs:
                    pkgs[sin[-1]] = []
                pkgs[sin[-1]].append(' '.join(sin[:-1]))
    return pkgs
def _extract_memory_info(self, dump_pathname, processor_notes):
    """Extract and return the JSON data from the .json.gz memory report
    file.

    On any failure a note is appended to *processor_notes* and a
    ``{"ERROR": message}`` sentinel dict is returned instead.
    """
    def error_out(error_message):
        # Record the problem for the processor, return the sentinel.
        processor_notes.append(error_message)
        return {"ERROR": error_message}
    try:
        fd = gzip_open(dump_pathname, "rb")
    except IOError as x:
        error_message = "error in gzip for %s: %r" % (dump_pathname, x)
        return error_out(error_message)
    try:
        memory_info_as_string = fd.read()
        # Guard against decompression bombs / oversized reports.
        if len(memory_info_as_string) > self.config.max_size_uncompressed:
            error_message = (
                "Uncompressed memory info too large %d (max: %d)" % (
                    len(memory_info_as_string),
                    self.config.max_size_uncompressed,
                )
            )
            return error_out(error_message)
        memory_info = json_loads(memory_info_as_string)
    except IOError as x:
        # gzip raises IOError on corrupt streams during read().
        error_message = "error in gzip for %s: %r" % (dump_pathname, x)
        return error_out(error_message)
    except ValueError as x:
        error_message = "error in json for %s: %r" % (dump_pathname, x)
        return error_out(error_message)
    finally:
        fd.close()
    return memory_info
def __init__(self, db, filename, compressed = False, index_offset_bits = 32):
    """
    Arguments:
    - `db`: dict-like holder of SQLAlchemy tables ('_word_idx',
      '_index_idx') and a connection ('conn').
    - `filename`: the filename of .idx file of stardict.
    - `compressed`: indicate whether the .idx file is compressed.
    - `index_offset_bits`: the offset field length in bits.
    """
    self.db = db
    self._offset = 0
    # Probe whether the word index has already been built; if so, skip
    # the (expensive) index construction below.
    s = self.db["_word_idx"].select().limit(1)
    rs = s.execute()
    result = rs.fetchone()
    if result is None :
        self._size = getsize(filename)
        if compressed:
            self.fh = gzip_open(filename, "rb")
        else:
            self.fh = open(filename, "rb")
        self._index = 0
        self._index_offset_bits = index_offset_bits
        #self.db["_word_idx"] = OOBTree()
        #self.db["_index_idx"] = OOBTree()
        trans = self.db["conn"].begin()
        # Iterating over self presumably parses the .idx records -- the
        # iterator protocol is implemented elsewhere in this class.
        for word_str, word_data_offset, word_data_size, index in self:
            #self.db["_index_idx"][self._index - 1] = (word_str, word_data_offset, word_data_size)
            i = self.db["_index_idx"].insert().values(idx = self._index - 1, word_str = word_str.decode("utf-8"), word_data_offset = word_data_offset, word_data_size = word_data_size)
            self.db["conn"].execute(i)
            #if word_str not in self.db["_word_idx"]:
            #    self.db["_word_idx"][word_str] = []
            #self.db["_word_idx"][word_str].append(self._index - 1)
            s = self.db["_word_idx"].select().where(self.db["_word_idx"].c.word_str == word_str.decode("utf-8"))
            rs = s.execute()
            result = rs.fetchone()
            t = time()
            if result is None :
                # First occurrence of this word: create an empty index list.
                i = self.db["_word_idx"].insert().values(word_str = word_str.decode("utf-8"), idx = str([]))
                self.db["conn"].execute(i)
                rs = s.execute()
                result = rs.fetchone()
            # NOTE(review): the list is round-tripped through str()/eval();
            # eval on stored data is unsafe if the DB is not trusted.
            newlist = eval(result[1])
            newlist.append(self._index - 1)
            j = self.db["_word_idx"].update().values(idx = str(newlist)).where(self.db["_word_idx"].c.word_str == word_str.decode("utf-8"))
            self.db["conn"].execute(j)
        trans.commit()
        del self._index_offset_bits
    mdebug("There were " + str(self._offset) + " total words.")
def full_io(url, path, remove=True):
    """Download *url* to *path* (blocking on the async fetcher), parse the
    '_pdbe_chain_remapping.' category from the gzipped mmCIF, and return
    the resulting dict; optionally delete the downloaded file."""
    path = UnsyncFetch.fetch_file(semaphore, 'get', dict(url=url), path, 1).result()
    with gzip_open(path, 'rt') as handle:
        # Only the chain-remapping category is extracted.
        mmcif_dict = MMCIF2DictPlus(handle, ('_pdbe_chain_remapping.', ))
    if remove:
        # path is a pathlib.Path here (has .unlink) -- returned by fetch_file.
        path.unlink()
    return mmcif_dict
def main():
    # Concatenate NCRF output files, validating/stripping the
    # "# ncrf end-of-file" marker lines. Python 2 script (print statement,
    # file(), old except syntax).
    # parse the command line
    requireEof = True
    markEndOfFile = False
    filenames = []
    for arg in argv[1:]:
        if (arg in ["--noendmark", "--noeof", "--nomark"]):  # (unadvertised)
            requireEof = False
        elif (arg in ["--markend]", "--markeof"]):
            # NOTE(review): "--markend]" contains a stray ']' -- presumably
            # a typo for "--markend"; left unchanged here.
            requireEof = False
            markEndOfFile = True
        else:
            filenames += [arg]
    if (filenames == []):
        usage("you have to give me at least one file")
    # copy the files; note that we don't bother (or care) to verify that they
    # are really output from ncrf
    for (ix, filename) in enumerate(filenames):
        if (ix > 0): print  # blank separator line between files
        eofMarkerSeen = False
        if (filename.endswith(".gz")) or (filename.endswith(".gzip")):
            f = gzip_open(filename, "rt")
        else:
            f = file(filename, "rt")
        for line in f:
            line = line.rstrip("\n")
            if (eofMarkerSeen) and (line != ""):
                exit("%s: \"%s\" contains additional stuff after end marker (starting with \"%s\")"
                     % (os_path.basename(argv[0]), filename, line[:10]))
            if (line == "# ncrf end-of-file"):
                eofMarkerSeen = True
                markEndOfFile = True
                continue
            if (not eofMarkerSeen):
                try:
                    print line
                except IOError, ex:
                    # "Broken pipe" can happen when downstream tools reject
                    # our output as their input
                    # NOTE(review): other IOErrors are silently swallowed.
                    if (ex.errno == EPIPE):
                        exit("%s: [Errno %d] Broken pipe" % (os_path.basename(argv[0]), ex.errno))
        f.close()
        if (requireEof) and (not eofMarkerSeen):
            exit("%s: \"%s\" may have been truncated (end marker is absent)"
                 % (os_path.basename(argv[0]), filename))
def _from_dbs_and_cache(fn, ds):
    # Query DBS for the dataset's file list and cache it gzipped at fn,
    # one LFN per line. Python 2 module (print statement).
    print 'hitting DBS %s for %s' % (ds, fn)
    from JMTucker.Tools.DBS import files_in_dataset
    # /USER datasets live in the phys03 instance; everything else in global.
    files = files_in_dataset(ds, instance='phys03' if ds.endswith('/USER') else 'global')
    with gzip_open(fn, 'w') as f:
        for file in files:
            f.write(file)
            f.write('\n')
    return files
def _read_gzip_file(fn):
    # Read a cached gzipped file list: one filename per line, blank lines
    # skipped. Python 2 module (print statement).
    print 'getting minbias file list from cache', fn
    files = []
    with gzip_open(fn) as f:
        for line in f:
            line = line.strip()
            if line:
                files.append(line)
    return files
def trans_gz(gz_file: str, out_dir: str):
    """Decompress *gz_file* into ``out_dir/prs_data`` and return that path.

    Non-gzip inputs (no trailing 'gz' extension) are returned unchanged.
    """
    if gz_file.split('.')[-1] != 'gz':
        return gz_file
    target = os.path.join(out_dir, 'prs_data')
    with gzip_open(gz_file) as src, open(target, 'w') as dst:
        for raw_line in src:
            dst.write(raw_line.decode())
    return target
def __init__(self, filename=JSONL_FILENAME):
    """Setup file for reading.

    Parameters
    ----------
    filename : str
        Filename for JSONL file with CVR data.
    """
    self.filename = filename
    self.line_number = 0
    if filename.endswith('.gz'):
        fid = gzip_open(filename, mode='rt')
    else:
        try:
            fid = open(filename)
        except IOError:
            # Plain file missing: fall back to a gzipped sibling.
            fid = gzip_open(filename + '.gz', mode='rt')
    self.fid = fid
def _extract_memory_info(self, dump_pathname, processor_notes):
    """Extract and return the JSON data from the .json.gz memory report
    file."""
    # Python 2 except syntax. NOTE(review): as visible here only the
    # open-failure path is handled; the read/parse logic appears to live
    # elsewhere (cf. the fuller sibling implementation) -- confirm against
    # the complete source.
    try:
        fd = gzip_open(dump_pathname, "rb")
    except IOError, x:
        error_message = "error in gzip for %s: %r" % (dump_pathname, x)
        processor_notes.append(error_message)
        return {"ERROR": error_message}
def __init__(self, filename):
    """
    Parameters
    ----------
    filename: str
        full path input file name
    """
    super().__init__(filename)
    # All subsequent reads go through a binary gzip stream.
    self.file_object = gzip_open(filename, 'rb')
def dump_gps_map(gps_map, pickle_gz_file_path):
    """Pickle *gps_map* into a gzip-compressed file.

    The '.pickle.gz' suffix is appended to the path when missing.
    """
    suffix = ".pickle.gz"
    if not pickle_gz_file_path.endswith(suffix):
        pickle_gz_file_path += suffix
    with gzip_open(pickle_gz_file_path, mode="wb") as fh:
        dump(gps_map, fh)
def __init__(self, file):
    """Initialize from a gzip-compressed pickle bundle.

    The bundle is a dict with 'models' and 'config' keys; optional
    configuration values fall back to defaults when absent.
    """
    # Fix: close the gzip handle (it previously leaked).
    with gzip_open(file, 'rb') as fh:
        tmp = load(fh)
    self.__models = tmp['models']
    self.__conf = tmp['config']
    self.__workpath = '.'
    self.Nlim = self.__conf.get('nlim', 1)
    self.TOL = self.__conf.get('tol', 1e10)
    self.__units = self.__conf.get('report_units')
    self.__show_structures = self.__conf.get('show_structures')
def __save(self):
    # Persist the stash (gzipped) and, when changed, the properties JSON.
    stashdump = self.__calc_stashdump()
    if stashdump is not None:
        # stashdump is pre-serialized bytes; just gzip it to disk.
        with gzip_open(self.__fname, 'wb') as f:
            f.write(stashdump)
    if len(self.__properties) and self.__properties_changed:
        # Guard property writes with the stash lock so concurrent savers
        # cannot corrupt the JSON file.
        with self.__stash_lock:
            with open(self.__pname, 'w') as f:
                json.dump(self.__properties, f)
def cache_data(self, hash, file, data, url=True, gzip=True):
    """Write *data* into the cache entry for (*hash*, *file*).

    A gzipped sibling ('<file>.gz') is written first when both the call
    and the cache enable gzip. Returns the public URL when *url* is True,
    otherwise None.
    """
    if gzip and self.gzip:
        with gzip_open(self.get_path(hash, file + '.gz'), 'wb') as gz_out:
            gz_out.write(data)
    with open(self.get_path(hash, file), 'wb') as raw_out:
        raw_out.write(data)
    return self.get_url(hash, file) if url else None
def save_message(mid, message, store_path):
    """
    :type mid: int message id
    :type message: str raw MIME message
    :param store_path: str path to store
    :return:
    """
    # Archive files are named by the hex message id.
    target = path_join(store_path, '%x.gz' % mid)
    with gzip_open(target, 'wb') as gz:
        gz.write(message)
def load_message(mid, store_path):
    """
    :type mid: int message id
    :type store_path: str path to load
    :return:
    """
    # Archive files are named by the hex message id.
    with gzip_open(path_join(store_path, '%x.gz' % mid), 'rb') as gz:
        return gz.read()
def store_cache_to_file(self, _attr_name, data, gzip=False):
    """Pickle *data* into '<cache_dir>/<_attr_name>.pickle', appending
    '.gz' and compressing when *gzip* is True."""
    blob = pickle_dumps(data)
    filename = self.cache_dir + '/' + _attr_name + '.pickle'
    if gzip:
        with gzip_open(filename + '.gz', "wb") as fh:
            fh.write(blob)
    else:
        with open(filename, "wb") as fh:
            fh.write(blob)
def step04( datadir ):
    # Scan train_trn.tsv.gz and count the distinct values seen in each of
    # the 100 feature columns (tracking is capped at 500 values/column);
    # write a per-column summary to step04.txt and report which columns
    # look binary (exactly two distinct values).
    uvalues_by_dim = {};
    with gzip_open( datadir+"/train_trn.tsv.gz", "rt" ) as f:
        firstline = f.readline();
        if firstline and firstline[-1] == '\n': firstline = firstline[:-1];
        firstline = firstline.split( '\t' );
        # Sanity-check the expected header layout.
        assert \
            firstline \
            == ( [ '"id"', '"y"', '"cId"' ]
                 + [ '"x{}"'.format(i) for i in range(1,101) ] );
        for line in f:
            if line and line[-1] == '\n': line = line[:-1];
            line = line.split( '\t' );
            id_ = line[0];
            y = line[1];
            cid = line[2];
            x = [ None ] + line[3:];  # pad so features are 1-indexed
            for dim in range(1,101):
                if not dim in uvalues_by_dim: uvalues_by_dim[ dim ] = set();
                # Cap the tracked set at 500 distinct values per column.
                if len( uvalues_by_dim[dim] ) < 500: uvalues_by_dim[ dim ].add( x[dim] );
    dim_by_uvalues = [];
    binary = [];
    for ( dim, uvalues ) in uvalues_by_dim.items():
        if len( uvalues ) < 5:
            uvalues_ = uvalues;
        else:
            uvalues_ = None;  # too many distinct values to display
        if len( uvalues ) == 2: binary.append( dim );
        dim_by_uvalues.append( ( len(uvalues), dim, uvalues_ ) );
    with open( datadir+"/step04.txt", "wt" ) as out:
        # NOTE: tuples are (count, dim, values); the unpacking names below
        # are swapped -- `uvalues` holds the count and `dim` the dimension.
        for ( uvalues, dim, uvalues_ ) in sorted( dim_by_uvalues ):
            print( "{:3d} {:7d} {:s}".format( dim, uvalues, repr(uvalues_) ) );
            print( "{:3d} {:7d} {:s}".format( dim, uvalues, repr(uvalues_) ), file=out );
        print( "-->", repr(binary) );
        print( "-->", repr(binary), file=out );
def gzip_file(source_path, archive_path):
    """
    Create a gzip compressed archive of ``source_path`` at ``archive_path``.

    An empty archive file will be created if the source file does not
    exist. This gives the diagnostic archive a consistent set of files
    which can easily be tested.
    """
    with gzip_open(archive_path, 'wb') as archive:
        # Returning early still leaves a valid (empty) gzip archive.
        if not os.path.isfile(source_path):
            return
        with open(source_path, 'rb') as source:
            copyfileobj(source, archive)
def test_downsample_16bit_image(self):
    # Regression test: a 16-bit grayscale TIFF (mode 'I;16') must come
    # back downsampled to 8-bit ('L') when resolved through the workspace.
    with pushd_popd(tempdir=True) as tempdir:
        # The fixture is stored gzipped in the repo; inflate it locally.
        with gzip_open(join(dirname(__file__), 'data/OCR-D-IMG_APBB_Mitteilungen_62.0002.tif.gz'), 'rb') as gzip_in:
            with open('16bit.tif', 'wb') as tif_out:
                tif_out.write(gzip_in.read())
        ws = self.resolver.workspace_from_nothing(directory=tempdir)
        ws.add_file('IMG', ID='foo', url='16bit.tif', mimetype='image/tiff', pageId=None)
        pil_before = Image.open('16bit.tif')
        assert pil_before.mode == 'I;16'
        pil_after = ws._resolve_image_as_pil('16bit.tif')
        assert pil_after.mode == 'L'
def load_packages_file(filename):
    """
    Load a gzip'd packages file.
    Returns a dictionary of package name and package key-values, or None
    when *filename* is None or does not exist.
    """
    # TODO: should we skip files like this if they don't exist?
    if filename is not None and isfile(filename):
        # Fix: close the gzip handle (it previously leaked).
        with gzip_open(filename) as fh:
            packages_contents = fh.read().decode('utf-8')
        return parse_packages(packages_contents)
    return None
def open(self, filename):
    """
    Open specified file for writing.

    File will be compressed if the gzip flag of the constructor was set
    to True.

    Args:
        filename (str): path to file to open for writing
    """
    # Select the opener according to the constructor's gzip flag.
    self.file = gzip_open(filename, 'wb') if self.gzip else open(filename, 'wb')
def _uncompress_archive(self, app_cache, local_archive):
    """Inflate the downloaded gzip archive into '<cache_dir>/.all.tar' and
    extract it; return True on success, False (after a warning) on read
    errors."""
    try:
        with gzip_open(local_archive) as zipped_file:
            archive_content = zipped_file.read()
        with open(os.path.join(app_cache.get_cache_dir(), '.all.tar'), 'wb') as extracted_file:
            extracted_file.write(archive_content)
    except (zlib.error, EnvironmentError) as exc:
        # Corrupt download or filesystem trouble: warn and signal failure.
        self.warn('Error while reading %s: %s' % (local_archive, exc))
        return False
    else:
        self._extract_archive(app_cache)
        return True
def _load_index_json(self, app_cache):
    """Load and parse the (optionally GPG-verified) gzipped index JSON from
    the app cache directory.

    Raises:
        Abort: when detached-signature verification fails.
    """
    index_json_gz_filename = os.path.join(app_cache.get_cache_dir(), '.index.json.gz')
    # Verification can be disabled via the 'appcenter/index/verify' UCR
    # variable.
    if not ucr_is_false('appcenter/index/verify'):
        detached_sig_path = index_json_gz_filename + '.gpg'
        (rc, gpg_error) = gpg_verify(index_json_gz_filename, detached_sig_path)
        if rc:
            if gpg_error:
                self.fatal(gpg_error)
            raise Abort('Signature verification for %s failed' % index_json_gz_filename)
    with gzip_open(index_json_gz_filename, 'rb') as fgzip:
        content = fgzip.read()
        return loads(content)
def get_archive(mid, archive_path):
    """Return the raw MIME bytes archived for message id *mid*.

    *archive_path* may be absolute, or relative to the current working
    directory (with ``~`` expansion).
    """
    if path_isabs(archive_path):
        base = archive_path
    else:
        base = expanduser(path_join(getcwd(), archive_path))
    archive_dir = realpath(base)
    # Archive files are named by the hex message id.
    path = path_join(archive_dir, '%x.gz' % mid)
    with gzip_open(path, 'rb') as f:
        mime = f.read()
    logger.debug('Archive \'%s\' extracted successfully. %d bytes' % (path, len(mime)))
    return mime
def __init__(self, filename, dict_ifo, dict_index, compressed = False):
    """Constructor.

    Arguments:
    - `filename`: filename of .dict file.
    - `dict_ifo`: IfoFileReader object.
    - `dict_index`: IdxFileReader object.
    """
    self._dict_ifo = dict_ifo
    self._dict_index = dict_index
    self._compressed = compressed
    self._offset = 0
    # Transparently support gzip-compressed (.dict.dz) payloads.
    opener = gzip_open if self._compressed else open
    self.fh = opener(filename, "rb")
def biom_open(fp, permission='U'):
    """Wrapper to allow opening of gzipped or non-compressed files

    Read or write the contents of a file

    file_fp : file path
    permission : either 'r','w','a'

    If the file is binary, be sure to pass in a binary mode (append 'b' to
    the mode); opening a binary file in text mode (e.g., in default mode 'U')
    will have unpredictable results.

    This code was copied from QIIME (www.qiime.org).
    """
    # Gzipped input is always opened binary; `permission` applies only to
    # plain files. is_gzip presumably sniffs the gzip magic -- confirm.
    if is_gzip(fp):
        return gzip_open(fp,'rb')
    else:
        return open(fp, permission)
def biom_open(fp, permission='U'):
    """Wrapper to allow opening of gzipped or non-compressed files

    Read or write the contents of a file

    file_fp : file path
    permission : either 'r','w','a'

    If the file is binary, be sure to pass in a binary mode (append 'b' to
    the mode); opening a binary file in text mode (e.g., in default mode 'U')
    will have unpredictable results.

    This function is ported from QIIME (http://www.qiime.org), previously
    named qiime_open. QIIME is a GPL project, but we obtained permission
    from the authors of this function to port it to the BIOM Format project
    (and keep it under BIOM's BSD license).
    """
    # Gzipped input is always opened binary; `permission` applies only to
    # plain files. is_gzip presumably sniffs the gzip magic -- confirm.
    if is_gzip(fp):
        return gzip_open(fp, 'rb')
    else:
        return open(fp, permission)
def __init__(self, outfile, mode='x'):
    """
    Parameters
    ----------
    outfile: Unicode
        full path output file name
    mode: str
        'w' open for writing, truncating the file first
        'x' open for exclusive creation, failing if the file already exists
        'a' open for writing, appending to the end of the file if it exists

    Raises
    ------
    FileNotFoundError: When the file cannot be opened
    FileExistsError: when infile exist and mode is x
    """
    super().__init__(outfile)
    # gzip streams are always binary.
    mode += 'b'
    try:
        self.file_object = gzip_open(outfile, mode)
    except FileExistsError:
        # Re-raise with a message that names the offending file and mode.
        raise FileExistsError('file exists: {} and mode is {}'.
                              format(outfile, mode))
def da_read( fn ):
    # Stream a gzipped TSV of competition data, yielding per row:
    #   ( id, y-or-None, [cId], binary_features, fixed_point_x_features ).
    # Continuous x values are parsed as scaled integers (value * 1000)
    # under strict format assertions; a final pass requires more than two
    # distinct values per continuous column.
    assert isfile( fn );
    with gzip_open( fn, "rt" ) as f:
        firstline = f.readline();
        if firstline and firstline[-1] == '\n': firstline = firstline[:-1];
        firstline = firstline.split( '\t' );
        has_y = None;
        # Two layouts: labeled (id, y, cId, x1..x100) or unlabeled.
        if firstline[:3] == [ '"id"', '"y"', '"cId"' ]:
            has_y = True;
        elif firstline[:3] == [ '"id"', '"cId"', '"x1"' ]:
            has_y = False;
        else:
            assert False;
        if has_y:
            assert \
                firstline \
                == ( [ '"id"', '"y"', '"cId"' ]
                     + [ '"x{}"'.format(i) for i in range(1,101) ] );
        else:
            assert \
                firstline \
                == ( [ '"id"', '"cId"' ]
                     + [ '"x{}"'.format(i) for i in range(1,101) ] );
        x_check = {};
        for line in f:
            if line and line[-1] == '\n': line = line[:-1];
            line = line.split( '\t' );
            id_ = line[0];
            id_ = int( id_ );
            if has_y:
                y = line[1];
                assert y in [ "0", "1" ];
                y = int( y );
                rest = line[2:];
            else:
                rest = line[1:];
            # cId is quoted in the raw file.
            c = rest[0];
            assert c[0] == '"';
            assert c[-1] == '"';
            c = int( c[1:-1] );
            b = [];
            x = [];
            for i in range( 1, len(rest) ):
                try:
                    val = rest[i];
                    if i in BINARY_FEATs:
                        assert val in [ "0", "1" ];
                        val = int(val)
                        b.append( val );
                        continue;
                    # Fixed-point parse: '[-]d.ddd' -> int(value * 1000).
                    if not '.' in val: val = val+'.';
                    val = val.split( '.' );
                    assert \
                        ( ( val[0][0] == '-' ) and ( len(val[0]) == 2 ) ) \
                        or ( ( val[0][0] != '-' ) and ( len(val[0]) == 1 ) );
                    assert \
                        len( val[1] ) <= 3;
                    while len( val[1] ) < 3: val[1] = val[1] + '0';
                    assert \
                        len( val[1] ) == 3;
                    if val[0][0] == '-':
                        val = - int( val[0][1:] ) * 1000 - int( val[1] );
                    else:
                        val = int( val[0] ) * 1000 + int( val[1] );
                    # Round-trip check against the original float text.
                    assert ( float(val) / 1000.0 ) == float(rest[i]);
                    # Track up to 3 distinct values per continuous column.
                    x_check_ = x_check.get( i, set() );
                    if len( x_check_ ) < 3:
                        x_check_.add( val );
                    x_check[ i ] = x_check_;
                    x.append( val );
                except:
                    # Surface the offending value before re-raising.
                    print( repr(val), rest[i] );
                    raise;
            if has_y:
                yield ( id_, y, [c], b, x );
            else:
                yield ( id_, None, [c], b, x );
        # Every continuous column must show at least 3 distinct values.
        for v in x_check.values():
            assert len( v ) > 2;
def step13( datadir ):
    # For the first ~10k rows, compare the "relevant" feature x[cId]
    # against a randomly chosen feature, split by label, and save a 2x2
    # histogram panel (bottom row excludes exact 0.0/1.0 values).
    with gzip_open( datadir+"/train.tsv.gz", "rt" ) as f:
        firstline = f.readline();
        if firstline and firstline[-1] == '\n': firstline = firstline[:-1];
        firstline = firstline.split( '\t' );
        assert \
            firstline \
            == ( [ '"id"', '"y"', '"cId"' ]
                 + [ '"x{}"'.format(i) for i in range(1,101) ] );
        pos_valsx = [];
        neg_valsx = [];
        pos_rndx = [];
        neg_rndx = [];
        pos_valsx_ = [];
        neg_valsx_ = [];
        pos_rndx_ = [];
        neg_rndx_ = [];
        i = 1;
        for line in f:
            i += 1;
            if i > 10000: break;  # sample only the first ~10k rows
            line_ = line;
            if line and line[-1] == '\n': line = line[:-1];
            line = line.split( '\t' );
            id_ = line[0];
            y = line[1];
            cid = line[2];
            x = [ None ];  # pad so features are 1-indexed by cId
            assert cid.startswith( '"' );
            assert cid.endswith( '"' );
            cid = int( cid[1:-1] );
            for x_ in line[3:]:
                x.append( float(x_) )
            relevant_x = x[cid];
            random_x = choice( x[1:] );
            if y == '0':
                neg_valsx.append(relevant_x);
                if relevant_x not in [ 0.0, 1.0 ]: neg_valsx_.append(relevant_x);
                neg_rndx.append(random_x);
                if random_x not in [ 0.0, 1.0 ]: neg_rndx_.append(random_x);
            elif y == '1':
                pos_valsx.append(relevant_x);
                if relevant_x not in [ 0.0, 1.0 ]: pos_valsx_.append(relevant_x);
                pos_rndx.append(random_x);
                if random_x not in [ 0.0, 1.0 ]: pos_rndx_.append(random_x);
    # blue = negatives, red = positives.
    ( fig, ax ) = plt.subplots( nrows=2, ncols=2, figsize=(6,6) );
    ax[0,0].hist( [ neg_valsx, pos_valsx ], 100, histtype='step', color='br', linewidth=3 );
    ax[0,1].hist( [ neg_rndx, pos_rndx ], 100, histtype='step', color='br', linewidth=3 );
    ax[1,0].hist( [ neg_valsx_, pos_valsx_ ], 100, histtype='step', color='br', linewidth=3 );
    ax[1,1].hist( [ neg_rndx_, pos_rndx_ ], 100, histtype='step', color='br', linewidth=3 );
    fig.savefig( datadir+'/step13.png' );
def __init__(self, thrift_type, filename, postprocess=None, filetype=FileType.AUTO):
    """
    Args:
        thrift_type: Class for Thrift type, e.g. Communication, TokenLattice
        filename (str):
        postprocess (function): A post-processing function that is called
            with the Thrift object as argument each time a Thrift object
            is read from the file
        filetype (FileType): Expected type of file.  Default value is
            `FileType.AUTO`, where function will try to automatically
            determine file type.

    Raises:
        ValueError: if filetype is not a known filetype name or id
    """
    filetype = FileType.lookup(filetype)
    self._seek_supported = True
    self._thrift_type = thrift_type
    if postprocess is None:
        # Default post-processing is a no-op.
        def _noop(obj):
            return
        self._postprocess = _noop
    else:
        self._postprocess = postprocess
    self._source_filename = filename
    # Dispatch on the (explicit or sniffed) container format.
    if filetype == FileType.TAR:
        self.filetype = 'tar'
        self.tar = tarfile.open(filename, 'r|')
    elif filetype == FileType.TAR_GZ:
        self.filetype = 'tar'
        self.tar = tarfile.open(filename, 'r|gz')
    elif filetype == FileType.TAR_BZ2:
        self.filetype = 'tar'
        self.tar = tarfile.open(filename, 'r|bz2')
    elif filetype == FileType.ZIP:
        self.filetype = 'zip'
        self.zip = zipfile.ZipFile(filename, 'r')
        self.zip_infolist = self.zip.infolist()
        self.zip_infolist_index = 0
    elif filetype == FileType.STREAM:
        self.filetype = 'stream'
        f = open(filename, 'rb')
    elif filetype == FileType.STREAM_GZ:
        self.filetype = 'stream'
        f = gzip_open(filename, 'rb')
    elif filetype == FileType.STREAM_BZ2:
        self.filetype = 'stream'
        f = bz2.BZ2File(filename, 'r')
    elif filetype == FileType.AUTO:
        # Sniff the container format from the file itself.
        if tarfile.is_tarfile(filename):
            self.filetype = 'tar'
            self.tar = tarfile.open(filename, 'r|*')
        elif zipfile.is_zipfile(filename):
            self.filetype = 'zip'
            self.zip = zipfile.ZipFile(filename, 'r')
            self.zip_infolist = self.zip.infolist()
            self.zip_infolist_index = 0
        elif mimetypes.guess_type(filename)[1] == 'gzip':
            # this is not a true stream---is_tarfile will have
            # successfully seeked backwards on the file if we have
            # reached this point
            self.filetype = 'stream'
            f = gzip_open(filename, 'rb')
        elif mimetypes.guess_type(filename)[1] == 'bzip2':
            # this is not a true stream
            self.filetype = 'stream'
            f = bz2.BZ2File(filename, 'r')
        else:
            # this is not a true stream
            self.filetype = 'stream'
            f = open(filename, 'rb')
    else:
        raise ValueError('unknown filetype %d' % filetype)
    if self.filetype == 'stream':
        # Wrap the raw file object in a Thrift transport/protocol pair.
        self.transport = TTransport.TFileObjectTransport(f)
        self.protocol = factory.createProtocol(self.transport)
        self.transport.open()
n = getattr(new, name) print 'runs in old not in new:' print sorted(o-n) l = sorted(x for x in n-o if x < 157460) print 'runs in new not in old before 2011: #', len(l), 'min', l[0] if len(l) else None, 'max', l[-1] if len(l) else None l = sorted(x for x in n-o if x >= 157460) print 'runs in new not in old during 2011: #', len(l), 'min', l[0] if len(l) else None, 'max', l[-1] if len(l) else None print sys.exit(0) elif 'make_lists' in sys.argv: from datetime import datetime from gzip import open as gzip_open from MuonAnalysis.Cosmics.runregistry import RunRegistryHelper epoch = min_time = datetime(2010, 2, 1) rrh = RunRegistryHelper(gzip_open('download.xml.gz')) # Get this from the run registry Table->Get Data->Generate... then Table->Get Data->Export->XML (all). kinds = [ ('cosmics', ['Cosmic10', 'Cosmics10', 'Cosmics11']), ('commissioning', ['BeamCommissioning10', 'BeamCommisioning10', 'Commissioning', 'Commissioining10', 'Commisioning10', 'Commissioning10', 'Commissioning11']), # "commissioning" is hard to spell ('collisions', ['Collisions10', 'PostCollisions10', 'Collisions11']), ] for kind_label, kind_groups in kinds: for det in ['dt', 'csc', 'strip', 'pix', 'rpc']: runs = rrh.get_good_runs([det.upper()], min_time, kind_groups) print '%s_runs_%s = set(%s)' % (kind_label, det, repr(runs).replace(' ', '')) print elif 'dump_trigger_menus' in sys.argv: from gzip import open as gzip_open from MuonAnalysis.Cosmics.runregistry import RunRegistryHelper
def main(args=None):
    """CLI entry point: predict labels for unlabeled sequences using a
    previously learned model bundle (encoder, label, HMM, feature
    extractor, classifier) and emit JSON predictions to ARGS.OUTPUT."""
    if args is None:
        args = sys.argv[1:]
    np.seterr(all='raise')
    parser, ns, args = init_args(description='Predict label for unlabeled sequences', args=args)
    parser = hmmer_args(parser)
    parser.add_argument('MODEL', type=PathType)
    parser.add_argument('SEQUENCES', type=PathType)
    ARGS = parse_args(parser, args, namespace=ns)
    # Load the pickled model bundle; the first element is a version tag.
    with gzip_open(ARGS.MODEL, 'rb') as fh:
        try:
            model = pickle_load(fh)
            if model[0] != MODEL_VERSION:
                raise ImportError('incompatible model version')
            ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf = model[1:]
        except ImportError:
            # Covers both the explicit version mismatch above and unpickling
            # failures from renamed/removed modules.
            msg = 'your model is not of the appropriate version, please re-learn your model'
            raise RuntimeError(msg)
    # create a temporary file wherein space characters have been removed
    with open(ARGS.SEQUENCES) as seq_fh:
        def seqrecords():
            # Yield input records, translating DNA when the model expects
            # protein; on verification failure fall back to amino alphabet.
            is_dna = ARGS.ENCODER == DNAEncoder
            seq_fmt = seqfile_format(ARGS.SEQUENCES)
            source = Verifier(SeqIO.parse(seq_fh, seq_fmt), DNAAlphabet)
            try:
                for record in source:
                    yield record if is_dna else translate(record)
            except VerifyError:
                if is_dna:
                    msg = (
                        "your model specifies a DNA encoding "
                        "which is incompatible with protein sequences"
                    )
                    raise RuntimeError(msg)
                source.set_alphabet(AminoAlphabet)
                for record in source:
                    yield record
        try:
            # Materialize the HMM to a temp file for the aligner.
            fd, tmphmm = mkstemp(); close(fd)
            with open(tmphmm, 'wb') as hmm_fh:
                hmm_fh.write(hmm)
            # explicitly gc hmm
            hmm = None
            tmpaln = generate_alignment_(seqrecords(), tmphmm, ARGS)
            alignment = load_stockholm(tmpaln, trim=True)
        finally:
            # NOTE(review): if generate_alignment_ raises, tmpaln is unbound
            # here and this cleanup itself raises NameError -- confirm.
            if exists(tmphmm):
                remove(tmphmm)
            if exists(tmpaln):
                remove(tmpaln)
    X = extractor.transform(alignment)
    y = clf.predict(X)
    feature_names = extractor.get_feature_names()
    # Only features selected by the mRMR step are reported.
    support = clf.named_steps['mrmr'].support_
    labels = ['"{0:s}"'.format(feature_names[i]) for i, s in enumerate(support) if s]
    emptys = [' ' * (len(label) + 2) for label in labels]
    idlen = max(len(r.id) for r in alignment) + 3
    print('{{\n "label": "{0:s}",\n "predictions": ['.format(ARGS.LABEL), file=ARGS.OUTPUT)
    for i, r in enumerate(alignment):
        # NOTE(review): the record separator goes to stdout, not
        # ARGS.OUTPUT -- looks unintentional; confirm.
        if i > 0:
            print(',')
        features = ['[ ']
        for j, x in enumerate(X[i, support]):
            if x:
                features.append(labels[j])
                features.append(', ')
            else:
                features.append(emptys[j])
        features.append(' ]')
        # replace the last comma with a space
        idx = None
        for k, f in enumerate(features):
            if f == ', ':
                idx = k
        if idx is None:
            features[0] = features[0].rstrip()
            features[-1] = features[-1].lstrip()
        else:
            features[idx] = ''
        features_ = ''.join(features)
        print(
            ' {{{{ "id": {{0:<{0:d}s}} "value": {{1: d}}, "features": {{2:s}} }}}}'.format(
                idlen).format('"{0:s}",'.format(r.id), y[i], features_),
            file=ARGS.OUTPUT, end='')
    print('\n ]\n}', file=ARGS.OUTPUT)
    finalize_args(ARGS)
    return 0
def opener(fp, mode):
    """Open *fp* as a gzip stream wrapped in a UTF-8 codec writer.

    Fix: the wrapped stream was constructed but never returned, so
    callers always received None.
    """
    return codecs.getwriter('utf-8')(gzip_open(fp, mode))
def step05( datadir ):
    # Tabulate positive-rate statistics over the packed binary-feature
    # bitmask, both globally and per (cId, bitmask); write both tables to
    # step05.txt.
    stats_by_b = {};
    stats_by_cid_b = {};
    with gzip_open( datadir+"/train_trn.tsv.gz", "rt" ) as f:
        firstline = f.readline();
        if firstline and firstline[-1] == '\n': firstline = firstline[:-1];
        firstline = firstline.split( '\t' );
        assert \
            firstline \
            == ( [ '"id"', '"y"', '"cId"' ]
                 + [ '"x{}"'.format(i) for i in range(1,101) ] );
        for line in f:
            if line and line[-1] == '\n': line = line[:-1];
            line = line.split( '\t' );
            id_ = line[0];
            y = line[1];
            cid = line[2];
            assert cid[0] == '"';
            assert cid[-1] == '"';
            cid = int( cid[1:-1] );
            x = [ None ];
            b = [];
            # Split columns into binary features (b) and the rest (x).
            for i in range( 3, len(line) ):
                if (i-2) in BINARY_FEATs:
                    b.append( line[i] );
                else:
                    x.append( line[i] );
            # Pack the binary features into a single integer bitmask.
            b_ = 0;
            for i in range( 0, len(b) ):
                if b[i] == '0':
                    b_i = 0;
                elif b[i] == '1':
                    b_i = 1;
                else:
                    assert False;
                b_ |= b_i << i;
            # Accumulate (total, positives) per bitmask ...
            (total,pos) = stats_by_b.get( b_, (0,0) );
            total += 1;
            if y == '1': pos += 1;
            stats_by_b[ b_ ] = ( total, pos );
            # ... and per (cId, bitmask).
            (total,pos) = stats_by_cid_b.get( (cid,b_), (0,0) );
            total += 1;
            if y == '1': pos += 1;
            stats_by_cid_b[ (cid,b_) ] = ( total, pos );
    with open( datadir+"/step05.txt", "wt" ) as out:
        for b in sorted( stats_by_b ):
            ( total, pos ) = stats_by_b[ b ];
            p = float(pos) / float(total);
            # print( "{:20s};{:7d};{:7d};{:1.4f}".format( hex(b), pos, total, p ) );
            print( "{:20s};{:7d};{:7d};{:1.4f}".format( hex(b), pos, total, p ), file=out );
        print( "-->", len(stats_by_b) );
        for (cid,b) in sorted( stats_by_cid_b ):
            ( total, pos ) = stats_by_cid_b[ (cid,b) ];
            p = float(pos) / float(total);
            # print( "{:20s};{:7d};{:7d};{:1.4f}".format( hex(cid)+'.'+hex(b), pos, total, p ) );
            print( "{:20s};{:7d};{:7d};{:1.4f}".format( hex(cid)+'.'+hex(b), pos, total, p ), file=out );
        print( "-->", len(stats_by_cid_b) );
def GzipType(string):
    """argparse ``type=`` factory: open *string* as a gzip file for writing.

    Returns:
        The open gzip file object.

    Raises:
        ArgumentTypeError: if the path cannot be opened -- as argparse
        requires. (Previously the error was *returned*, handing the
        exception object to the caller as if it were the file; the bare
        `except` is also narrowed to OSError.)
    """
    try:
        return gzip_open(string, 'wb')
    except OSError:
        raise ArgumentTypeError("cannot open '{0:s}' for writing".format(string))
#!/usr/bin/env python from sys import argv from gzip import open as gzip_open from biom.parse import parse_biom_table if __name__ == '__main__': table = parse_biom_table(gzip_open(argv[1])) foo = table.transformSamples(lambda x, y, z: x)
# use the ratings in the newest created dataset datasets.sort(key=lambda ds: self.parse_timestamp(ds.find('CREATE_TIME').text)) ds = datasets[-1] cmps = ds.find('CMPS').findall('CMP') if all(self.is_good(cmps, subdet) for subdet in subdets): good.append(run_number) good.sort() return good if __name__ == '__main__': from gzip import open as gzip_open min_time = datetime(2010, 2, 1) rrh = RunRegistryHelper(gzip_open('download.xml.gz')) dt_st = rrh.get_good_runs(['DT', 'STRIP'], min_time) dt_px_st = rrh.get_good_runs(['DT', 'PIX', 'STRIP'], min_time) # "Histogram" of GROUP_NAMEs used. from collections import defaultdict from pprint import pprint d = defaultdict(list) for run in rrh.runs: d[rrh.group_name(run)].append((rrh.run_number(run), rrh.start_time(run))) to_show = 8 for k in sorted(d.keys()): v = d[k] v.sort() print k if len(v) > to_show: