def main():
    """Command-line entry point: split an xz corpus into train/test sets."""
    parser = argparse.ArgumentParser(description='dataset generator')
    parser.add_argument('-p', '--possibility', type=float, default=0.9,
                        help='possibility to add train dataset')
    parser.add_argument('source',
                        help='path to mecab-processed corpus (xz compressed)')
    parser.add_argument('train',
                        help='path for writing training dataset (xz compressed)')
    parser.add_argument('test',
                        help='path for writing testing dataset (xz compressed)')
    args = parser.parse_args()
    # Read the corpus as text; both outputs are written as raw xz streams.
    with lzma.open(args.source, 'rt') as src_file, \
            lzma.open(args.train, 'wb') as train_file, \
            lzma.open(args.test, 'wb') as test_file:
        separate(src_file, args.possibility, train_file, test_file)
def _open(self, mode, compress=None):
    """Open ``self._filename`` through the transaction store.

    mode:     file mode; "t" is appended when neither "t" nor "b" is present
              and compression is in play (lzma.open defaults to binary).
    compress: "lzma", "xz", or None.  When None, falls back to the
              "compression" entry of self._options if present.

    Returns the opened file object.  Raises FilemanipError for an
    unsupported compression type.
    """
    self._txnstore.prepare_open(self._filename, mode)
    # Fall back to the configured compression when none was requested.
    if compress is None and "compression" in self._options:
        compress = self._options["compression"]
    if compress is not None:
        if compress not in ["lzma", "xz"]:
            raise newfol.exception.FilemanipError("Compression type " + compress + " not supported")
        # Force text mode unless the caller chose explicitly.
        if "t" not in mode and "b" not in mode:
            mode += "t"
    if self._isfp:
        # Already holding an open file object; reuse it.
        fp = self._file
    elif compress == "xz":
        # Request SHA256 integrity data when writing; -1 keeps the library
        # default when reading.
        check = -1
        if "r" not in mode:
            check = lzma.CHECK_SHA256
        fp = lzma.open(self._filename, mode, check=check)
    elif compress == "lzma":
        # Legacy .lzma container (FORMAT_ALONE).
        fp = lzma.open(self._filename, mode, format=lzma.FORMAT_ALONE)
    else:
        fp = open(self._filename, mode)
    self._txnstore.commit_open(self._filename, mode)
    return fp
def open_file(filename, mode, encoding=None):
    """Open *filename* with transparent compression handling.

    "-" selects stdin (reading) or stdout (writing).  When reading, the
    compression format is sniffed from the stream's magic bytes; when
    writing, it is chosen from the filename extension.  If *mode* contains
    "b" the raw binary stream is returned; otherwise it is wrapped in a
    line-buffered text layer using surrogateescape error handling.
    """
    import sys, io
    binary = mode.endswith("b")
    # The underlying stream is always opened in binary mode; text decoding
    # is layered on at the end.
    mode = mode.rstrip("b") + "b"
    if mode.startswith("r"):
        if filename == "-":
            fileobj = sys.stdin.buffer
        else:
            fileobj = open(filename, mode)
        # Peek at the header without consuming it to detect compression.
        buf = fileobj.peek(100)
        if buf.startswith(b"\x1f\x8b\x08"):
            # gzip magic bytes plus the deflate method byte.
            import gzip
            fileobj = gzip.open(fileobj, mode)
        elif buf[0:3] == b"BZh" and buf[4:10] == b"1AY&SY":
            # bzip2: "BZh" header (byte 3 is the level digit) + block magic.
            import bz2
            fileobj = bz2.open(fileobj, mode)
        elif buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            # Legacy .lzma header or .xz magic.
            import lzma
            fileobj = lzma.open(fileobj, mode)
    else:
        if filename == "-":
            fileobj = sys.stdout.buffer
        elif filename.endswith(".gz"):
            import gzip
            fileobj = gzip.open(filename, mode)
        elif filename.endswith(".bz2"):
            import bz2
            fileobj = bz2.open(filename, mode)
        elif filename.endswith(".xz"):
            import lzma
            fileobj = lzma.open(filename, mode)
        else:
            fileobj = open(filename, mode)
    if binary:
        return fileobj
    else:
        return io.TextIOWrapper(fileobj, encoding=encoding,
                                errors="surrogateescape", line_buffering=True)
def gplus_get_filehandler(write, fname):
    """Reserved for GraphicsPlus internal use"""
    # Uncompressed mode: plain binary files.
    if not gplus_options["compress"]:
        return open(fname, "w+b") if write else open(fname, "rb")
    if not write:
        return lzma.open(fname, "r")
    # Truncate an existing archive, otherwise create it exclusively.
    lzma_mode = "w" if os.path.exists(fname) else "x"
    return lzma.open(fname, lzma_mode)
def replace_syslinux_modules(syslinux_version, under_this_dir):
    # Replace modules files extracted from iso with corresponding
    # version provided by multibootusb.
    """Swap every .c32 module under *under_this_dir* for the bundled copy
    matching *syslinux_version*; modules without a bundled replacement are
    decompressed in place if they turn out to be lzma-compressed."""
    modules_src_dir = os.path.join(
        multibootusb_host_dir(), "syslinux", "modules", syslinux_version)
    for dirpath, dirnames, filenames in os.walk(under_this_dir):
        for fname in filenames:
            if not fname.lower().endswith('.c32'):
                continue
            # NOTE(review): dirpath from os.walk already starts with
            # under_this_dir; the extra prefix is only discarded by
            # os.path.join when dirpath is absolute — verify callers.
            dst_path = os.path.join(under_this_dir, dirpath, fname)
            src_path = os.path.join(modules_src_dir, fname)
            if not os.path.exists(src_path):
                # No bundled replacement: some ISOs ship the module
                # lzma-compressed, so try expanding it in place.
                log("Suitable replacement of '%s' is not bundled. "
                    "Trying to unlzma." % fname)
                try:
                    with lzma.open(dst_path) as f:
                        expanded = f.read()
                except lzma.LZMAError:
                    # Not an lzma stream; leave the module untouched.
                    continue
                except (OSError, IOError) as e:
                    log("%s while accessing %s." % (e, dst_path))
                    continue
                with open(dst_path, 'wb') as f:
                    f.write(expanded)
                log("Successfully decompressed %s." % fname)
                continue
            try:
                os.remove(dst_path)
                shutil.copy(src_path, dst_path)
                log("Replaced %s module" % fname)
            except (OSError, IOError) as err:
                log(err)
                log("Could not update " + fname)
def universal_write_open(path, mode, buffering=-1, encoding=None, errors=None,
                         newline=None, compresslevel=9, format=None, check=-1,
                         preset=None, filters=None, compression=None):
    """Open *path* for writing with optional compression.

    compression: None (plain file), 'gz'/'gzip', 'lzma'/'xz', or 'bz2'.
    The remaining keyword arguments are forwarded to the matching opener.

    Raises ValueError for an unknown compression type.
    """
    # pylint: disable=unexpected-keyword-arg,no-member
    if compression is None:
        return builtins.open(path, mode=mode, buffering=buffering,
                             encoding=encoding, errors=errors, newline=newline)
    elif compression in ('gz', 'gzip'):
        if six.PY2:
            # Python 2's gzip.open lacks the text-mode keyword arguments.
            return gzip.open(path, mode=mode, compresslevel=compresslevel)
        else:
            return gzip.open(path, mode=mode, compresslevel=compresslevel,
                             errors=errors, newline=newline, encoding=encoding)
    elif compression in ('lzma', 'xz'):
        try:
            import lzma
        except ImportError:
            # Python 2 fallback package.
            from backports import lzma
        return lzma.open(path, mode=mode, format=format, check=check,
                         preset=preset, filters=filters, encoding=encoding,
                         errors=errors, newline=newline)
    elif compression == 'bz2':
        if six.PY2 or '__pypy__' in sys.builtin_module_names:
            import bz2file as bz2  # pylint: disable=import-error
        else:
            import bz2
        return bz2.open(path, mode=mode, compresslevel=compresslevel,
                        encoding=encoding, errors=errors, newline=newline)
    else:
        # Bug fix: 'bz2' is supported but was missing from this message.
        raise ValueError(
            'compression must be None, gz, gzip, bz2, lzma, or xz and was {0}'.format(compression))
def writeLog(filename, data):
    """Write *data* (str) to *filename* as an xz-compressed UTF-8 stream.

    Raises edce.error.ErrorLog on any failure, chaining the original
    exception so the root cause is preserved (the previous bare ``except``
    discarded it and would also have trapped KeyboardInterrupt).
    """
    try:
        with lzma.open(filename, "w") as f:
            f.write(data.encode('UTF-8'))
    except Exception as exc:
        errstr = "Error: writeLog FAIL"
        raise edce.error.ErrorLog(errstr) from exc
def get_uncompressed_stream(input_stream, compression="auto"):
    """
    Returns a file-like object (aka stream) providing an uncompressed
    version of the content read on the input stream provided.

    :param input_stream: The file-like object providing compressed data.
    :param compression: The compression type. Specify "auto" to let the
        function guess it out of the associated filename (the input_stream
        needs to have a name attribute, otherwise a ValueError is raised).
    :type compression: str
    """
    if compression == "auto":
        # Guess from the stream's filename when one is attached.
        if not hasattr(input_stream, 'name'):
            raise ValueError("Can't retrieve a name out of %r" % input_stream)
        compression = guess_compression_method(input_stream.name)

    # Pick the decompressing module, then open the stream through it.
    if compression == "gzip":
        import gzip as decompressor
    elif compression == "bzip2":
        import bz2 as decompressor
    elif compression == "xz":
        import lzma as decompressor
    elif compression is None:
        return input_stream
    else:
        raise NotImplementedError(
            "Unknown compression method: %r" % compression)
    return decompressor.open(filename=input_stream, mode="rb")
def _load_kanjivg(fname):
    """Parse an xz-compressed KanjiVG XML file into {character: KanjiVGEntry}."""
    with lzma.open(fname, 'rt', encoding='utf-8') as f:
        tree = ElementTree()
        tree.parse(f)

    def parse_kanji(kanji):
        # Converts str('kvg:04e17-g7') to int(7) and str('kvg:04e7e-s11') to int(11)
        indexnum = lambda s: int(s.rpartition('-')[2][1:])
        ididx = lambda elem: indexnum(elem.attrib['id'])
        # Stroke path data, ordered by the stroke number embedded in the id.
        strokes = [ stroke.attrib['d'] for stroke in sorted(kanji.findall('.//path'), key=ididx) ]
        # Pair every <g> group with a (element-or-id, number) sort key so
        # groups with the same key can be merged by groupby below.
        gdata = sorted(
            (((group.attrib.get('element', group.attrib['id']),
               int(group.attrib.get('number', '0'))), group)
             for group in kanji.findall('.//g')),
            key=itemgetter(0))
        # We use 0-indexed stroke numbers here so you can directly use them as indices to the strokes element.
        groups = [
            [ ididx(path)-1
              for (_elem, __number), group in foo
              for path in group.findall('.//path') ]
            for (elem, _number), foo in groupby(gdata, itemgetter(0)) ]
        return KanjiVGEntry(strokes, groups)

    # Converts str('kvg:kanji_05726') to the single character at that
    # hex code point (original comment here was mojibake-garbled).
    kvgchr = lambda s: chr(int(s[len('kvg:kanji_'):].rstrip(string.ascii_letters+string.punctuation), 16))
    return { kvgchr(kanji.attrib['id']): parse_kanji(kanji) for kanji in tree.findall('kanji') }
def download(addons, extr):
    """Fetch each workshop addon, decompress its .gma, optionally extract it."""
    for res in workshopinfo(addons):
        if "title" not in res:
            print("Addon does not exist!")
            return
        name = res['title']
        url = res['file_url']
        print("Downloading '%s' from the workshop" % name)
        fetcher = Wgety()
        file_id = res['publishedfileid']
        lzmafile = "%s.gma.lzma" % file_id
        outfile = "%s.gma" % file_id
        fetcher.execute(url=url, filename=lzmafile)
        print("Downloaded '%s' from the workshop. Decompressing..." % name)
        # Inflate the lzma payload into the .gma, then drop the archive.
        with lzma.open(lzmafile) as compressed, open(outfile, "wb") as gma:
            gma.write(compressed.read())
        os.remove(lzmafile)
        if not extr:
            return
        # Sanitise characters that are illegal in directory names.
        safe_name = re.sub('[\\/:"*?<>|]+', '_', name)
        gmafile.extract(outfile, safe_name)
def open_compressed(filename, mode='rb'):
    """
    Open a file for reading with automatic decompression.

    The compression format (gzip, xz, or bz2) is selected from the
    filename extension; anything else is opened as a plain file.

    Arguments
    ---------
    filename to open

    Returns
    -------
    open file object
    """
    suffix = filename.rsplit('.', 1)[-1]
    if suffix == 'gz':
        import gzip
        opener = gzip.open
    elif suffix == 'xz':
        import lzma
        opener = lzma.open
    elif suffix == 'bz2':
        import bz2
        opener = bz2.open
    else:
        opener = open
    return opener(filename, mode)
def dumpcache(self):
    """Write every cached profile bucket to disk as an xz-compressed
    numpy .npy file, then clear the in-memory cache."""
    sizes = [v.size for v in self._db.values()]
    logging.info("Dumping cache for {:,} profiles in {:d} buckets to {!s}".format(
        sum(sizes), len(self._db), self.bucket_dir()))
    bar = progressbar.ProgressBar(maxval=len(self._db),
        widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    newdirs = 0
    # Histogram of bucket occupancy, indexed by bucket size.
    counts = numpy.zeros(dtype=numpy.uint32, shape=(max(sizes)+1),)
    for (i, (k, v)) in enumerate(self._db.items()):
        path = self.bucket_name(k)
        if not path.parent.is_dir():
            #logging.info("Creating directory {!s}".format(path.parent))
            path.parent.mkdir(parents=True)
            newdirs += 1
        # if v.nbytes > 1e6:
        #     logging.debug("Storing {:d}/{:d}, {:,} bytes to {!s}".format(
        #         i, len(self._db), v.nbytes, path))
        counts[v.size] += 1
        # numpy.save writes straight into the lzma stream, giving
        # transparently compressed .npy files.
        with lzma.open(str(path), mode="wb") as fp:
            numpy.save(fp, v)
        bar.update(i+1)
    bar.finish()
    logging.info("Stored cache. Created {:d} new directories. "
                 "Profiles per bucket histogram: {!s}".format(newdirs, counts))
    self.clearcache()
def db_iter(path='ucd.xml.xz'):
    """Yield each parsed element of the xz-compressed UCD XML file,
    with any '{namespace}' prefix stripped from the tag.  Elements are
    cleared after being yielded to keep memory bounded."""
    with lzma.open(path, 'rb') as stream:
        for _event, element in et.iterparse(stream):
            tag = element.tag
            if '}' in tag:
                element.tag = tag.split('}', 1)[1]
            yield element
            element.clear()
def handle_savegame(root, file):
    """Archive an incoming savegame under a random name and e-mail the
    player whose turn it is."""
    filename = os.path.join(root, file)
    print("Handling savegame: " + filename)
    with lzma.open(filename, mode="rt") as f:
        txt = f.read().split("\n")
    status.savegames_read += 1
    # Random suffix avoids collisions between processed games.
    new_filename = "pbem_processed_" + str(random.randint(0, 10000000000)) + ".xz"
    shutil.move(filename, os.path.join(root, new_filename))
    print("New filename will be: " + new_filename)
    players = list_players(txt)
    phase = find_phase(txt)
    turn = find_turn(txt)
    game_id = find_game_id(txt)
    print("game_id=" + str(game_id))
    print("phase=" + str(phase))
    print("turn=" + str(turn))
    print("players=" + str(players))
    # The phase number indexes the player whose move it is.
    active_player = players[phase]
    print("active_player=" + active_player)
    active_email = find_email_address(active_player)
    status.games[game_id] = [turn, phase, players, time.ctime()]
    if active_email is not None:
        print("active email=" + active_email)
        mailer = MailSender()
        mailer.send_email(active_player, players, active_email,
                          new_filename.replace(".xz", ""), turn)
        status.emails_sent += 1
def handle_savegame(filename):
    """Read an xz-compressed savegame and print its phase and raw content."""
    print("Handling " + filename)
    with lzma.open(filename) as f:
        # str() of the bytes payload keeps the original quick-and-dirty
        # "b'...'" representation downstream code expects.
        txt = str(f.read())
    phase = find_phase(txt)
    print(phase)
    print(txt)
def load(filename):
    """Yield every pickled object stored back-to-back in an xz file."""
    with lzma.open(filename, 'rb') as dataset:
        while True:
            try:
                item = pickle.load(dataset)
            except EOFError:
                # End of stream: no more pickled objects.
                return
            yield item
def download(addons, path, extr):
    """Download workshop addons into *path*, decompress, optionally extract."""
    for res in workshopinfo(addons):
        if "title" not in res:
            print("Addon does not exist!")
            return
        name = res['title']
        url = res['file_url']
        print("Downloading '%s' from the workshop" % name)
        file_id = res['publishedfileid']
        lzmafile = os.path.join(path, "%s.gma.lzma" % file_id)
        outfile = os.path.join(path, "%s.gma" % file_id)
        # Progress hook prints a running percentage as blocks arrive.
        urllib.request.urlretrieve(
            url, lzmafile,
            lambda x, y, z: sys.stdout.write("\r{0:.2f}%".format(x * y / z)))
        sys.stdout.write("\r100.00%\n")
        print("Downloaded '%s' from the workshop. Decompressing..." % name)
        with lzma.open(lzmafile) as compressed, open(outfile, "wb") as gma:
            gma.write(compressed.read())
        os.remove(lzmafile)
        if not extr:
            return
        # Sanitise characters that are illegal in directory names.
        target = os.path.join(path, re.sub('[\\/:"*?<>|]+', '_', name))
        gmafile.extract(outfile, target)
def get_badge_data_and_write_function(
    host, badge_id, filename, require_file=False
):
    """Load cached badge data for *host*/*badge_id* and return it together
    with a writer callback that persists it back to disk.

    Looks for data/<host>-<filename>.json.xz first, then the plain .json
    variant.  When neither exists, starts from an empty BadgeData unless
    require_file is True (then FileNotFoundError propagates).

    Returns (badge_data, write) where write() rewrites the .json.xz file.
    """
    filename = host + '-' + filename
    logger.info("Loading {} badges...".format(filename))
    try:
        f = lzma.open('data/' + filename + '.json.xz', 'rt')
    except FileNotFoundError:
        try:
            # Fall back to an uncompressed copy of the same data.
            f = open('data/' + filename + '.json', 'rt')
        except FileNotFoundError:
            if not require_file:
                f = None
            else:
                raise
    if f:
        with f:
            badge_data = scraping.BadgeData.from_json(json.load(f))
    else:
        # No cache on disk: start empty.
        badge_data = scraping.BadgeData(host=host, badge_id=badge_id)
    logger.info("...{} {} badges loaded.".format(len(badge_data), filename))

    def write():
        # Persist the current badge data as xz-compressed JSON.
        logger.info("Writing {} {} badges...".format(len(badge_data), filename))
        with lzma.open('data/' + filename + '.json.xz', 'wt') as f:
            json.dump(badge_data.to_json(), f)
        logger.info("...wrote {} {} badges.".format(len(badge_data), filename))

    return badge_data, write
def load_tickets(self, candidates, gvt_csv):
    """Load group voting tickets for this state from an xz-compressed CSV.

    candidates: lookup object resolving (surname, given name, party) to a
                candidate record with a CandidateID.
    gvt_csv:    path to the xz-compressed GVT CSV export.

    Populates self.raw_ticket_data (sorted numbered preferences) and
    self.gvt (ticket owner -> list of PreferenceFlow).
    """
    with lzma.open(gvt_csv, 'rt') as fd:
        reader = csv.reader(fd)
        # skip introduction line
        next(reader)
        header = next(reader)
        # Sort so that rows group cleanly by (state, owner, ticket number)
        # and arrive in preference order within each group.
        it = sorted(
            named_tuple_iter('GvtRow', reader, header, PreferenceNo=int,
                             TicketNo=int, OwnerTicket=lambda t: t.strip()),
            key=lambda gvt: (gvt.State, ticket_sort_key(gvt.OwnerTicket),
                             gvt.TicketNo, gvt.PreferenceNo))
        for (state_ab, ticket, ticket_no), g in itertools.groupby(
                it, lambda gvt: (gvt.State, gvt.OwnerTicket, gvt.TicketNo)):
            if state_ab != self.state_name:
                continue
            prefs = []
            for ticket_entry in g:
                candidate = candidates.lookup_name_party(
                    ticket_entry.Surname, ticket_entry.GivenNm,
                    ticket_entry.PartyNm)
                prefs.append(
                    (ticket_entry.PreferenceNo, candidate.CandidateID))
            # Drop unnumbered preferences before recording the raw data.
            non_none = [x for x in prefs if x[0] is not None]
            self.raw_ticket_data.append(sorted(non_none, key=lambda x: x[0]))
            if ticket not in self.gvt:
                self.gvt[ticket] = []
            self.gvt[ticket].append(PreferenceFlow(tuple(prefs)))
def xu_open(filename, mode='rb'):
    """
    function to open a file no matter if zipped or not. Files with
    extension '.gz', '.bz2', or '.xz' are assumed to be compressed and
    transparently opened to read like usual files.

    Parameters
    ----------
     filename:  filename of the file to open (full including path)
     mode:      mode in which the file should be opened

    Returns
    -------
     file handle of the opened file

    If the file does not exist an IOError is raised by the open routine,
    which is not caught within the function
    """
    if filename.endswith('.gz'):
        return gzip.open(filename, mode)
    if filename.endswith('.bz2'):
        return bz2.BZ2File(filename, mode)
    if filename.endswith('.xz'):
        # lzma only ships with the standard library from Python 3.3 on.
        if sys.version_info < (3, 3):
            raise TypeError("File compression type not supported in Python "
                            "versions prior to 3.3")
        return lzma.open(filename, mode)
    return open(filename, mode)
def retrieve_model(self):
    """Download the latest trained model artifact when its ETag changed.

    Returns the local path of the decompressed model file (which may be
    stale if the download failed).
    """
    os.makedirs('models', exist_ok=True)
    file_name = f'{self.name()}model'  # noqa: E999
    file_path = os.path.join('models', file_name)
    model_url = f'https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/{file_name}.xz'  # noqa
    # Compare the remote ETag with the one recorded at the last download.
    r = requests.head(model_url, allow_redirects=True)
    new_etag = r.headers['ETag']
    try:
        with open(f'{file_path}.etag', 'r') as f:  # noqa
            old_etag = f.read()
    except IOError:
        # No previous download recorded.
        old_etag = None
    if old_etag != new_etag:
        try:
            urlretrieve(model_url, f'{file_path}.xz')
        except HTTPError:
            # Keep whatever local copy exists; log and bail out.
            logger.exception('Tool {}'.format(self.name()))
            return file_path
        # Decompress the artifact next to the download.
        with lzma.open(f'{file_path}.xz', 'rb') as input_f:  # noqa
            with open(file_path, 'wb') as output_f:
                shutil.copyfileobj(input_f, output_f)
        # Record the ETag only after a successful download+decompress.
        with open(f'{file_path}.etag', 'w') as f:  # noqa
            f.write(new_etag)
    return file_path
def retrieve_model(name):
    """Fetch model *name* from BASE_URL into MODELS_DIR when its ETag changed.

    Returns the path of the decompressed model file.
    """
    os.makedirs(MODELS_DIR, exist_ok=True)
    file_name = f"{name}model"
    file_path = os.path.join(MODELS_DIR, file_name)
    base_model_url = BASE_URL.format(name)
    model_url = f"{base_model_url}/{file_name}.xz"
    LOGGER.info(f"Checking ETAG of {model_url}")
    r = requests.head(model_url, allow_redirects=True)
    r.raise_for_status()
    new_etag = r.headers["ETag"]
    try:
        with open(f"{file_path}.etag", "r") as f:
            old_etag = f.read()
    except IOError:
        # No previous download recorded.
        old_etag = None
    if old_etag != new_etag:
        LOGGER.info(f"Downloading the model from {model_url}")
        urlretrieve(model_url, f"{file_path}.xz")
        # Decompress the artifact next to the download.
        with lzma.open(f"{file_path}.xz", "rb") as input_f:
            with open(file_path, "wb") as output_f:
                shutil.copyfileobj(input_f, output_f)
        LOGGER.info(f"Written model in {file_path}")
        # Record the ETag only after a successful download+decompress.
        with open(f"{file_path}.etag", "w") as f:
            f.write(new_etag)
    else:
        LOGGER.info(f"ETAG for {model_url} is ok")
    return file_path
def handle_savegame(root, file):
    """Archive an incoming savegame under a random name, record its state,
    and e-mail the player whose turn it is."""
    time.sleep(1)  # presumably lets the writer finish flushing the file — confirm
    filename = os.path.join(root, file)
    print("Handling savegame: " + filename)
    txt = None
    with lzma.open(filename, mode="rt") as f:
        txt = f.read().split("\n")
    status.savegames_read += 1
    # Random suffix avoids collisions between processed games.
    new_filename = "pbem_processed_" + str(random.randint(0, 10000000000)) + ".xz"
    f.close()  # redundant: the with-block above already closed f
    shutil.move(filename, os.path.join(root, new_filename))
    print("New filename will be: " + new_filename)
    players = list_players(txt)
    phase = find_phase(txt)
    turn = find_turn(txt)
    game_id = find_game_id(txt)
    state = find_state(txt)
    print("game_id=" + str(game_id))
    print("phase=" + str(phase))
    print("turn=" + str(turn))
    print("state=" + str(state))
    print("players=" + str(players))
    # The phase number indexes the player whose move it is.
    active_player = players[phase]
    print("active_player=" + active_player)
    active_email = find_email_address(active_player)
    status.games[game_id] = [turn, phase, players, time.ctime(), int(time.time()), state]
    if active_email != None:
        print("active email=" + active_email)
        m = MailSender()
        m.send_email(active_player, players, active_email,
                     new_filename.replace(".xz", ""), turn)
        status.emails_sent += 1
def parse_candle(bi5, date, point=5):
    """Decode a Dukascopy .bi5 candle file into {candle_time: OHLCV dict}.

    bi5:   sequence whose first element is the path of the lzma-compressed file.
    date:  base day for the candles; a str is parsed and normalised to UTC
           midnight, a datetime is used as-is (must be timezone-aware).
    point: number of fixed-point decimal places in the stored prices.

    Returns an empty dict when the file cannot be decompressed.
    """
    from datetime import timezone  # stdlib UTC; avoids pytz on the hot path

    quote = {}
    if isinstance(date, str):  # was: type(date) == str
        date = dateutil.parser.parse(date).replace(
            tzinfo=pytz.utc, hour=0, minute=0, second=0, microsecond=0)
    s = struct.Struct('>L')
    try:
        with lzma.open(bi5[0]) as f:
            content = f.read()
    except (EOFError, lzma.LZMAError):
        # LZMAError added: a corrupt (not merely truncated) file previously
        # escaped the EOFError-only handler.  Also fixes the "Conitnue" typo.
        print('{}: File is not valid lzma file. Continue'.format(date))
        return quote

    scale = 10 ** point
    base = date.astimezone(timezone.utc)  # hoisted out of the loop
    size = len(content)
    idx = 0
    # Each 24-byte record holds six big-endian uint32s:
    # seconds offset, open, high, low, close (fixed-point), volume.
    while idx < size:
        time_delta = s.unpack(content[idx:idx + 4])[0]
        price_open = s.unpack(content[idx + 4:idx + 8])[0] / scale
        price_high = s.unpack(content[idx + 8:idx + 12])[0] / scale
        price_low = s.unpack(content[idx + 12:idx + 16])[0] / scale
        price_close = s.unpack(content[idx + 16:idx + 20])[0] / scale
        volume = s.unpack(content[idx + 20:idx + 24])[0]
        last_candle = base + timedelta(seconds=time_delta)
        # The old try/except KeyError/finally here was dead code: the
        # finally clause unconditionally overwrote the entry anyway.
        quote[last_candle] = {'open': price_open, 'high': price_high,
                              'low': price_low, 'close': price_close,
                              'vol': volume}
        idx += 24
    return quote
def finish(self, private_key):
    """Finalise the APT repository: write plain and xz Packages indexes,
    hard-link them into each architecture directory, and emit a
    gpg-clearsigned InRelease file referencing their checksums."""
    # Create package index.
    def write_entry(f, package, version):
        # One index stanza per package: control fields, then location,
        # size, and SHA256 of the .deb, terminated by a blank line.
        f.write(self._get_control_snippet(package, version))
        filename = self._get_filename(package, version)
        path = os.path.join(self._new_path, filename)
        f.write(
            'Filename: %s\n'
            'Size: %u\n'
            'SHA256: %s\n' % (
                filename,
                os.path.getsize(path),
                util.sha256(path).hexdigest(),
            ))
        f.write('\n')

    index = os.path.join(self._new_path, 'Packages')
    # Write the plain and xz-compressed indexes in lockstep.
    with open(index, 'wt') as f, lzma.open(index + '.xz', 'wt') as f_xz:
        for package, version in self._packages:
            write_entry(f, package, version)
            write_entry(f_xz, package, version)

    # Link the index into the per-architecture directory.
    for arch in self._architectures:
        index_arch = os.path.join(
            self._new_path,
            'dists/cloudabi/cloudabi/binary-%s/Packages' % arch)
        util.make_parent_dir(index_arch)
        os.link(index, index_arch)
        os.link(index + '.xz', index_arch + '.xz')
    # Capture sizes/checksums before removing the originals; the
    # hard-linked copies keep the data alive.
    checksum = util.sha256(index).hexdigest()
    checksum_xz = util.sha256(index + '.xz').hexdigest()
    size = os.path.getsize(index)
    size_xz = os.path.getsize(index + '.xz')
    os.unlink(index)
    os.unlink(index + '.xz')

    # Create the InRelease file.
    with open(
        os.path.join(self._new_path, 'dists/cloudabi/InRelease'), 'w'
    ) as f, subprocess.Popen([
        'gpg', '--local-user', private_key, '--armor', '--sign',
        '--clearsign', '--digest-algo', 'SHA256',
    ], stdin=subprocess.PIPE, stdout=f) as proc:
        # Stream the release metadata through gpg for clear-signing.
        def append(text):
            proc.stdin.write(bytes(text, encoding='ASCII'))
        append(
            'Suite: cloudabi\n'
            'Components: cloudabi\n'
            'Architectures: %s\n'
            'Date: %s\n'
            'SHA256:\n' % (
                ' '.join(sorted(self._architectures)),
                time.strftime("%a, %d %b %Y %H:%M:%S UTC", time.gmtime())))
        for arch in sorted(self._architectures):
            append(' %s %d cloudabi/binary-%s/Packages\n' %
                   (checksum, size, arch))
            append(' %s %d cloudabi/binary-%s/Packages.xz\n' %
                   (checksum_xz, size_xz, arch))
def open_lzma(file, *, mode, encoding=None, errors=None, newline=None,
              external=False, parallel=False):
    """Open an xz file, decompressing through an external `xz` process
    when requested and available; otherwise use the lzma module."""
    if external and EXTERNAL_XZ:
        xz_cmd = [EXTERNAL_XZ, '-c', '-d']
        return ProcessIOReader(xz_cmd, file, mode, encoding, errors, newline)
    return lzma.open(file, mode=mode, encoding=encoding,
                     errors=errors, newline=newline)
def _recompress_to_gz(xz, gz): import lzma import gzip with lzma.open(xz) as xzf, gzip.open(gz, mode='xb') as gzf: while True: block = xzf.read(1024 * 1024) if not block: break gzf.write(block)
def rebuild(rev):
    """Reconstruct a blob's full content (list of lines) at revision *rev*.

    Recursively rebuilds the parent revision and applies the stored,
    xz-compressed patch on top of it.  A None revision is the empty root.
    """
    if rev is None:
        return []
    c = get_db().execute("SELECT parent, patch FROM blobs WHERE id = ?",
                         (sqlite3.Binary(rev),))
    parent, patch_id = c.fetchone()
    # Patches are stored on disk under their hex-encoded id.
    with lzma.open(os.path.join(app.config["BASE_PATH"],
                                binascii.hexlify(patch_id).decode("utf-8")),
                   "rb") as f:
        patch_ = f.read().splitlines(True)
    return patch(rebuild(parent), patch_)
def load_food_desc(data_dir):
    """Populate the food_dict/food_list_per_group/food_list globals from
    the pipe-separated food_desc.xz table in *data_dir*."""
    path = os.path.join(data_dir, 'food_desc.xz')
    with lzma.open(path, 'rt', encoding='utf8') as inputfile:
        for line in inputfile:
            fields = line.strip("\n\r").split('|')
            food_id = fields[0]
            group_id = fields[1]
            food_name = fields[2]
            food_dict[food_id] = (group_id, food_name)
            food_list_per_group[group_id].append((food_id, food_name))
            food_list.append(tuple(fields))
def raw2np(filename, shape):
    """Load a raw uint16 array from *filename*, decompressing .xz files.

    filename: path to a raw binary dump, optionally xz-compressed.
    shape:    target shape for the returned numpy array.
    """
    if filename.endswith(".xz"):
        sys.stdout.write("Decompressing data...\n")
        sys.stdout.flush()
        with lzma.open(filename) as decompf:
            data = decompf.read()
        # np.fromstring is deprecated (removed in modern numpy);
        # frombuffer + copy keeps the result writable as before.
        return np.frombuffer(data, dtype=np.uint16).reshape(shape).copy()
    else:
        return np.fromfile(filename, dtype=np.uint16).reshape(shape)
def load_icom_stream(icom_path):
    """Return the decompressed bytes of the xz file at *icom_path*."""
    with lzma.open(icom_path, "r") as stream:
        return stream.read()
def _index_file(pp: Path, opts: Options) -> Results: logger = get_logger() # TODO use kompress? # TODO not even sure if it's used... suf = pp.suffix.lower() if suf == '.xz': # TODO zstd? import lzma uname = pp.name[:-len('.xz')] uncomp = Path(get_tmpdir().name) / uname with lzma.open(pp, 'rb') as cf: with uncomp.open('wb') as fb: fb.write(cf.read()) yield from _index(path=uncomp, opts=opts) return # TODO dispatch org mode here? # TODO try/catch? if suf not in SMAP: pm = mime(pp) if pm not in SMAP: yield RuntimeError(f"Unexpected file extension: {pp}, {pm}") return else: ip = SMAP.get(pm, None) # TODO assume plaintext? else: ip = SMAP.get(suf, None) if ip is None: # TODO only log once? logger.debug('file type suppressed: %s', pp) return indexer: Union[Urls, Results] = ip(pp) # type: ignore # TODO careful, filter out obviously not plaintext? maybe mime could help here?? root = opts.root fallback_dt = datetime.fromtimestamp(pp.stat().st_mtime, tz=pytz.utc) fallback_loc = Loc.file(pp) replacer = opts.replacer for r in indexer: if isinstance(r, Exception): yield r continue if isinstance(r, EUrl): v = Visit( url=r.url, dt=fallback_dt, locator=fallback_loc, context='::'.join(r.ctx), ) else: v = r loc = v.locator if loc is not None and root is not None: # meh. but it works # todo potentially, just use dataclasses instead... loc = loc._replace(title=loc.title.replace(str(root) + '/', '')) v = v._replace(locator=loc) if replacer is not None: upd: Dict[str, Any] = {} href = v.locator.href if href is not None: upd['locator'] = v.locator._replace(href=replacer(href), title=replacer(v.locator.title)) ctx = v.context if ctx is not None: # TODO in context, http is unnecessary upd['context'] = replacer(ctx) v = v._replace(**upd) yield v
def read_file(filepath):
    """Return the text content of the xz-compressed HPX output at *filepath*.

    Raises PermissionError when the file is not readable.  The previous
    ``assert os.access(...)`` is gone: asserts are stripped under
    ``python -O``, so the guard silently vanished in optimised runs.
    """
    if not os.access(filepath, os.R_OK):
        raise PermissionError("cannot read {!r}".format(filepath))
    with lzma.open(filepath, 'rt', encoding='utf-8') as hpx_output_handle:
        return hpx_output_handle.read()
def __init__(self, word_list=None):
    """Load a passphrase word list (default 'eff-long') from its xz file."""
    chosen = 'eff-long' if word_list is None else word_list
    with lzma.open(wordlist_path('{}.txt.xz'.format(chosen))) as f:
        raw = f.read()
    # One word per line, decoded and trimmed.
    self.wordlist = raw.decode().strip().split('\n')
def cached_path(
    url_or_filename,
    download_config=None,
    **download_kwargs,
) -> Optional[str]:
    """
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
    return the path to the cached file. If it's already a local path,
    make sure the file exists and then return the path.

    Return:
        Local path (string)

    Raises:
        FileNotFoundError: in case of non-recoverable file
            (non-existent or no cache on disk)
        ConnectionError: in case of unreachable url
            and no cache on disk
        ValueError: if it couldn't parse the url or filename correctly
        requests.exceptions.ConnectionError: in case of internet connection issue
    """
    if download_config is None:
        download_config = DownloadConfig(**download_kwargs)

    cache_dir = download_config.cache_dir or config.HF_DATASETS_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
    if isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)

    if is_remote_url(url_or_filename):
        # URL, so get it from the cache (downloading if necessary)
        output_path = get_from_cache(
            url_or_filename,
            cache_dir=cache_dir,
            force_download=download_config.force_download,
            proxies=download_config.proxies,
            resume_download=download_config.resume_download,
            user_agent=download_config.user_agent,
            local_files_only=download_config.local_files_only,
            use_etag=download_config.use_etag,
            max_retries=download_config.max_retries,
        )
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        output_path = url_or_filename
    elif urlparse(url_or_filename).scheme == "" or os.path.ismount(urlparse(url_or_filename).scheme + ":/"):
        # File, but it doesn't exist.
        # On unix the scheme of a local path is empty, while on windows the scheme is the drive name (ex: "c")
        # for details on the windows behavior, see https://bugs.python.org/issue42215
        raise FileNotFoundError("Local file {} doesn't exist".format(url_or_filename))
    else:
        # Something unknown
        raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))

    if download_config.extract_compressed_file and output_path is not None:
        # Not a recognised archive: return the file itself.
        if (
            not is_zipfile(output_path)
            and not tarfile.is_tarfile(output_path)
            and not is_gzip(output_path)
            and not is_xz(output_path)
            and not is_rarfile(output_path)
        ):
            return output_path

        # Path where we extract compressed archives
        # We extract in the cache dir, and get the extracted path name by hashing the original path"
        abs_output_path = os.path.abspath(output_path)
        output_path_extracted = os.path.join(cache_dir, "extracted", hash_url_to_filename(abs_output_path))
        # Reuse a previous extraction unless force_extract is set.
        if (
            os.path.isdir(output_path_extracted)
            and os.listdir(output_path_extracted)
            and not download_config.force_extract
        ) or (os.path.isfile(output_path_extracted) and not download_config.force_extract):
            return output_path_extracted

        # Prevent parallel extractions
        lock_path = output_path + ".lock"
        with FileLock(lock_path):
            shutil.rmtree(output_path_extracted, ignore_errors=True)
            os.makedirs(output_path_extracted, exist_ok=True)
            if tarfile.is_tarfile(output_path):
                tar_file = tarfile.open(output_path)
                tar_file.extractall(output_path_extracted)
                tar_file.close()
            elif is_gzip(output_path):
                # gzip holds a single member: the extraction target is a
                # plain file, so drop the directory created above.
                os.rmdir(output_path_extracted)
                with gzip.open(output_path, "rb") as gzip_file:
                    with open(output_path_extracted, "wb") as extracted_file:
                        shutil.copyfileobj(gzip_file, extracted_file)
            elif is_zipfile(output_path):
                # put zip file to the last, b/c it is possible wrongly detected as zip
                with ZipFile(output_path, "r") as zip_file:
                    zip_file.extractall(output_path_extracted)
                    zip_file.close()
            elif is_xz(output_path):
                # xz also holds a single member; same plain-file target.
                os.rmdir(output_path_extracted)
                with lzma.open(output_path) as compressed_file:
                    with open(output_path_extracted, "wb") as extracted_file:
                        shutil.copyfileobj(compressed_file, extracted_file)
            elif is_rarfile(output_path):
                if config.RARFILE_AVAILABLE:
                    import rarfile

                    rf = rarfile.RarFile(output_path)
                    rf.extractall(output_path_extracted)
                    rf.close()
                else:
                    raise EnvironmentError("Please pip install rarfile")
            else:
                raise EnvironmentError("Archive format of {} could not be identified".format(output_path))

        return output_path_extracted

    return output_path
def smart_open(filename, *args, **kwargs):
    """Open *filename*, transparently using lzma for '.xz' paths."""
    opener = lzma.open if filename.endswith('.xz') else open
    return opener(filename, *args, **kwargs)
if arch_pkg: arch_pkg.add_deb(info) # print(convertPackage(info, package_names + optional_names)) # get list of unique arch packages from package map arch_package_names=list(arch_packages.keys()) arch_package_names.sort() deb_package_names=[] print(header_tpl.format( package_names="(" + " ".join( arch_package_names ) + ")", pkgver=pkgver, pkgrel=pkgrel, dlagents=dlagents, source="\n\t".join(sources), sha256sums="\n\t".join(sha256sums) )) print(package_functions) with lzma.open(source_file, "r") as tar: with tarfile.open(fileobj=tar) as tf: with tf.extractfile("amdgpu-pro-%s-%s/Packages" %(pkgver_base,pkgver_build)) as packages: writePackages(packages) for pkg in arch_package_names: print( arch_packages[pkg].toPKGBUILD() )
('bc', BaggingClassifier(verbose=True)) ] model = StackingClassifier(estimators=estimators, final_estimator=GBC(n_estimators=500, max_depth=20, verbose=True)) model.fit(train_data, train_target) print(f"Train set score: {accuracy_score(model.predict(train_data), train_target)}") print(f"Train set score: {accuracy_score(model.predict(test_data), test_target)}") # TODO: The trained model needs to be saved. All sklearn models can # be serialized and deserialized using the standard `pickle` module. # Additionally, we also compress the model. # # To save a model, open a target file for binary access, and use # `pickle.dump` to save the model to the opened file: with lzma.open(args.model_path, "wb") as model_file: pickle.dump(model, model_file) # The `recodex_predict` is called during ReCodEx evaluation (there can be # several Python sources in the submission, but exactly one should contain # a `recodex_predict` method). def recodex_predict(data): # The `data` is a pandas.DataFrame containt test set input. args = parser.parse_args([]) # TODO: Predict target values for the given data. # # You should probably start by loading a model. Start by opening the model # file for binary read access and then use `pickle.load` to deserialize the # model from the stored binary data:
def fixture_fileh(filename):
    """Open the named xz fixture from the ../data directory."""
    fixture_path = os.path.join(dirname, "../data", filename)
    return lzma.open(fixture_path, "r")
def main():
    """Motif-finding demo: recover the DIG1 motif from baker's-yeast upstream regions.

    Reads an xz-compressed FASTA genome and an xz-compressed tab-separated
    gene list, then runs a randomized motif search over a hand-picked set of
    2000-base gene upstream regions.  Output is markdown/HTML written to
    stdout; the surrounding div/bm markers are always emitted (finally).
    """
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        # *** THIS BLOCK LOADS IN THE SEQUENCE AND GENES FOR BAKER'S YEAST ***
        fasta_filepath = 'GCA_000146045.2_R64_genomic.fna.xz'
        with lzma.open(fasta_filepath, mode='rt', encoding='utf-8') as f:
            lines = f.read().splitlines()
            # Drop FASTA header lines ('>'), join the rest into one sequence.
            lines = [line for line in lines if not line.startswith('>')]
            seq = ''.join(lines).upper()
        genes_filepath = 'GCA_000146045.2_R64_gene_result.txt.xz'
        with lzma.open(genes_filepath, mode='rt', encoding='utf-8') as f:
            csv_reader = csv.reader(f, delimiter='\t')
            genes_data = []
            for row in csv_reader:
                genes_data.append(row)
            genes = []
            # Skip the header row; column 12 appears to be the gene start
            # position and column 5 the gene name -- TODO confirm against the
            # NCBI gene_result dump format.
            for row in genes_data[1:]:
                gene_start_pos_str = row[12].strip()
                if gene_start_pos_str == '':
                    continue
                gene_name = row[5].strip()
                gene_start_pos = int(gene_start_pos_str)
                genes.append((gene_name, gene_start_pos))
            # Sort by start position so positional lookups are ordered.
            genes.sort(key=lambda g: g[1])
        # This is an artificial example. I went through yeast motifs in http://motifmap.ics.uci.edu/ and picked one out
        # (DIG1). Then I went through NCBI and tried to find the genome and gene list for this particular strain of
        # yeast. I searched through the sequence and known gene locations to pull out gene upstream regions that
        # contained this motif.
        #
        # This is the closest I could get to a practical example short of doing my own experiments, which I don't have
        # the equipment or wherewithal to do.
        #
        # Motifs: http://motifmap.ics.uci.edu/ (Click on motif search and select yeast -- it will display all motifs)
        # Sequence: https://www.ncbi.nlm.nih.gov/assembly/GCF_000146045.2
        # Gene list: https://www.ncbi.nlm.nih.gov/gene/?term=txid559292%5BOrganism%3Anoexp%5D+DIG1
        #
        # To speed the example up, I only used a subset of the found gene upstreams.

        # *** THIS BLOCK SEARCHES THE SEQUENCE FOR MOTIF MEMBERS OF DIG1 ***
        # search_locs = []
        # for m in re.finditer(r'AAA..[AG]AA.GA[AG][AG]AA.A[AG]', seq): # This is the motif for DIG1
        #     start_idx, end_idx = m.span()
        #     motif_member = m.string[start_idx:end_idx]
        #     closest_gene =\
        #         min(
        #             map(
        #                 lambda g: (g[0], g[1] - start_idx, g[1]),
        #                 filter(
        #                     lambda g: g[1] > start_idx,
        #                     genes
        #                 )
        #             ),
        #             key=lambda g: g[1],
        #             default=None
        #         )
        #     if closest_gene is not None and closest_gene[1] <= 2000:
        #         print(f'Found {motif_member} {start_idx}, closest gene is {closest_gene}')
        #         search_locs.append((closest_gene[2] - 2000, closest_gene[2]))
        # for start_idx, end_idx in search_locs:
        #     print(f'seq[{start_idx}:{end_idx}]')
        # # Found AAAAGGAAGGAAAAATAG 14779, closest gene is ('THI12', 53, 14832)
        # # Found AAACAAAAAGAAAAAAAG 65682, closest gene is ('TOS6', 62, 65744)
        # # Found AAAAGAAAAGAGAAATAG 67732, closest gene is ('snR85', 36, 67768)
        # # Found AAAAAAAAGGAAAAAAAG 70180, closest gene is ('YHL017W', 96, 70276)
        # # Found AAAGAAAAAGAAAAAAAA 128183, closest gene is ('SYN8', 69, 128252)
        # # Found AAAAGAAAAGAAAAAAAG 172014, closest gene is ('YPL199C', 19, 172033)
        # # Found AAACGGAATGAGGAATAA 183306, closest gene is ('RPC53', 37, 183343)
        # # Found AAAAAAAACGAAAAAAAA 268978, closest gene is ('CDC5', 41, 269019)
        # # Found AAAAAAAAGGAAAAAGAA 293881, closest gene is ('YBR027C', 143, 294024)
        # # Found AAAGAAAAAGAAAAAGAA 404722, closest gene is ('YCK3', 91, 404813)
        # # Found AAACGGAATGAGGAATAA 451419, closest gene is ('MEH1', 15, 451434)
        # # Found AAACGGAATGAGGAATAA 457003, closest gene is ('KTR6', 115, 457118)
        # # Found AAAAAAAACGAGAAAAAG 488333, closest gene is ('MSK1', 53, 488386)
        # # Found AAACGGAATGAGGAATAA 489960, closest gene is ('VPS21', 236, 490196)
        # # Found AAACGGAATGAGGAATAA 495545, closest gene is ('SHE3', 47, 495592)
        # # Found AAACGGAATGAGGAATAA 557448, closest gene is ('TIF34', 33, 557481)
        # # Found AAAAAAAATGAAAAACAA 590680, closest gene is ('GRR1', 192, 590872)
        # # Found AAACGAAACGAAGAAAAA 645845, closest gene is ('YDR098C-B', 13, 645858)
        # # Found AAAGAAAAAGAGAAATAA 760834, closest gene is ('BSC5', 289, 761123)
        # # Found AAAAAGAAAGAAAAAAAG 779839, closest gene is ('IRC13', 31, 779870)
        # # Found AAAGCAAAAGAAGAAAAA 780606, closest gene is ('DFR1', 300, 780906)
        # # Found AAAACAAACGAAAAAAAA 783720, closest gene is ('ECL1', 503, 784223)
        # # Found AAAGAAAATGAAAAAAAA 791430, closest gene is ('STB3', 918, 792348)
        # # Found AAACGAAAGGAGAAATAA 873685, closest gene is ('FBP1', 61, 873746)
        # # Found AAAGAAAAGGAAAAAAAG 1116394, closest gene is ('YCG1', 732, 1117126)
        # # Found AAACGGAATGAGGAATAA 1126271, closest gene is ('UBX5', 1601, 1127872)
        # # Found AAACGGAATGAGGAATAA 1195086, closest gene is ('BCP1', 326, 1195412)
        # # Found AAACAAAAAGAAAAACAA 1195582, closest gene is ('TFC6', 1097, 1196679)
        # # Found AAACGGAATGAGGAATAA 1212779, closest gene is ('KEI1', 69, 1212848)

        # *** THIS BLOCK RUNS MOTIF FINDING ALGO ON THE GENE UPSTREAM REGIONS -- FOUND MOTIF SHOULD BE FOR DIG1 ***
        # note: some gene upstream regions were commented out to speed up motif finding
        gene_upstreams = [
            seq[12832:14832],  # THI12
            # seq[63744:65744],
            # seq[65768:67768],
            seq[68276:70276],  # YHL017W
            seq[126252:128252],  # SYN8
            # seq[170033:172033],
            # seq[181343:183343],
            # seq[267019:269019],
            # seq[292024:294024],
            # seq[402813:404813],
            # seq[449434:451434],
            # seq[455118:457118],
            # seq[486386:488386],
            # seq[488196:490196],
            # seq[493592:495592],
            # seq[555481:557481],
            # seq[588872:590872],
            # seq[643858:645858],
            # seq[759123:761123],
            # seq[777870:779870],
            # seq[778906:780906],
            # seq[782223:784223],
            # seq[790348:792348],
            # seq[871746:873746],
            seq[1115126:1117126],  # YCG1
            seq[1125872:1127872],  # UBX5
            # seq[1193412:1195412],
            # seq[1194679:1196679],
            seq[1210848:1212848]  # KEI1
        ]
        k = 18
        # If we searched for a slightly larger or smaller k, we would likely get some hits that contain parts of
        # the correct motif members (k=18). I think the correct course of action is to play with k and see what
        # parts of the upstream regions light up. If they're consistently lighting up within the same parts, you
        # may be on the right track?
        #
        # Since this is an artificial example, we already know that k=18.
        print(
            f'Organism is baker\'s yeast. Suspected genes influenced by transcription factor: THI12, YHL017W, SYN8,'
            f' YCG1, UBX5, and KEI1.',
            end="\n\n")
        print(
            f'Searching for {k}-mer across a set of {len(gene_upstreams)} gene upstream regions...',
            end="\n\n")
        best_motif_matrix = None
        # 200 random restarts; keep the lowest-scoring (best) motif matrix.
        for iteration in range(200):
            found_motif_matrix = randomized_motif_search_with_psuedocounts(
                k, gene_upstreams)
            if best_motif_matrix is None or score_motif(
                    found_motif_matrix) < score_motif(best_motif_matrix):
                best_motif_matrix = found_motif_matrix
        print(f'{"<br>".join(best_motif_matrix)}', end="\n\n")
        print(f'Score is: {score_motif(best_motif_matrix)}', end="\n\n")
    finally:
        # Always close the HTML wrapper, even if loading/search failed.
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
'required': ['id', 'description', 'find', 'drs'], } jsonschema.validate(config, schema) drs_re = re.compile(config["drs"], re.VERBOSE) find_command = [ "/bin/find", *config["find"]["paths"], *shlex.split(config["find"].get("options", "")), ] print(shlex.join(find_command)) with tempfile.TemporaryFile('w+') as f, tempfile.TemporaryFile( 'w+') as s, lzma.open("catalogue.csv.xz", mode="wt", newline="") as out, lzma.open('errors.xz', mode='wt') as e: # Find files print("Finding Files...") find = subprocess.run(find_command, stdout=f) find.check_returncode() f.seek(0) # Sort the results print("Sorting Files...") sort = subprocess.run(["/bin/sort"], stdin=f, stdout=s) sort.check_returncode() s.seek(0) # Get the column names
def xopen(filename, mode='r'):
    """
    Replacement for the "open" function that can also open files that have
    been compressed with gzip or bzip2. If the filename is '-', standard
    output (mode 'w') or input (mode 'r') is returned. If the filename ends
    with .gz, the file is opened with a pipe to the gzip program. If that
    does not work, then gzip.open() is used (the gzip module is slower than
    the pipe to the gzip program). If the filename ends with .bz2, it's
    opened as a bz2.BZ2File. Otherwise, the regular open() is used.

    mode can be: 'rt', 'rb', 'a', 'wt', or 'wb'
    Instead of 'rt' and 'wt', 'r' and 'w' can be used as abbreviations.

    In Python 2, the 't' and 'b' characters are ignored.

    Append mode ('a') is unavailable with BZ2 compression and will raise an
    error.
    """
    # Normalize the abbreviated modes to their explicit text forms.
    if mode == 'r':
        mode = 'rt'
    elif mode == 'w':
        mode = 'wt'
    if mode not in ('rt', 'rb', 'wt', 'wb', 'a'):
        raise ValueError("mode '{0}' not supported".format(mode))
    if not PY3:
        # Python 2: keep only the primary mode character ('r'/'w'/'a').
        mode = mode[0]
    if not isinstance(filename, basestring):
        raise ValueError("the filename must be a string")

    # standard input and standard output handling
    if filename == '-':
        if not PY3:
            return sys.stdin if 'r' in mode else sys.stdout
        # Python 3: pick the text stream or the underlying binary buffer.
        return dict(rt=sys.stdin,
                    wt=sys.stdout,
                    rb=sys.stdin.buffer,
                    wb=sys.stdout.buffer)[mode]

    if filename.endswith('.bz2'):
        if bz2 is None:
            raise ImportError(
                "Cannot open bz2 files: The bz2 module is not available")
        if PY3:
            if 't' in mode:
                # BZ2File is binary-only; wrap it for text access.
                return io.TextIOWrapper(bz2.BZ2File(filename, mode[0]))
            else:
                return bz2.BZ2File(filename, mode)
        else:
            return bz2.BZ2File(filename, mode)
    elif filename.endswith('.xz'):
        if lzma is None:
            raise ImportError(
                "Cannot open xz files: The lzma module is not available (use Python 3.3 or newer)"
            )
        # lzma.open handles both text and binary modes natively.
        return lzma.open(filename, mode)
    elif filename.endswith('.gz'):
        if PY3:
            if 't' in mode:
                return io.TextIOWrapper(gzip.open(filename, mode[0]))
            else:
                # Buffer the raw GzipFile for faster binary I/O.
                if 'r' in mode:
                    return io.BufferedReader(gzip.open(filename, mode))
                else:
                    return io.BufferedWriter(gzip.open(filename, mode))
        else:
            # rb/rt are equivalent in Py2
            if 'r' in mode:
                try:
                    # Prefer the external gzip program (faster than the module).
                    return GzipReader(filename)
                except IOError:
                    # gzip not installed
                    return buffered_reader(gzip.open(filename, mode))
            else:
                try:
                    return GzipWriter(filename, mode)
                except IOError:
                    return buffered_writer(gzip.open(filename, mode))
    else:
        return open(filename, mode)
def get_stored_def(self, rid):
    """Return the stored definition for run *rid*, decoded as UTF-8.

    The definition lives at ``<run_dir>/<rid>/STORED_DEF_NAME`` and is
    xz-compressed on disk.
    """
    stored_path = self.run_dir / str(rid) / STORED_DEF_NAME
    with lzma.open(stored_path) as fh:
        raw = fh.read()
    return raw.decode('utf-8')
import lzma

# Compress the X86_hello artifact into X86_hello.xz.
#
# Fix: read the source file in *binary* mode. The original opened it in
# text mode and re-encoded the result as UTF-8, which performs newline
# translation and raises UnicodeDecodeError on bytes that are not valid
# UTF-8 -- fatal for a compiled binary (as the name "X86_hello" suggests).
# Reading bytes directly round-trips any content unchanged.
with open('X86_hello', 'rb') as f:
    data = f.read()

# For lzma.open, "wb" is the explicit spelling of the default binary
# write mode ("w" in the original).
with lzma.open("X86_hello.xz", "wb") as fd:
    fd.write(data)
import lzma from FindMaximalNonBranchingPaths import find_maximal_non_branching_paths from Kdmer import Kdmer from Read import Read from ReadPair import ReadPair from ToDeBruijnGraph import to_debruijn_graph reads_filepath = 'FinalChallengeReads.txt.xz' with lzma.open(reads_filepath, mode='rt', encoding='utf-8') as f: lines = f.read().splitlines() lines = [l.strip() for l in lines] # get rid of whitespace lines = [l for l in lines if len(l) > 0] # get rid of empty lines lines_split = [tuple(l.split('|', maxsplit=2)) for l in lines] kdmers = [Kdmer(k1, k2, 1000) for k1, k2 in lines_split] rps = [ReadPair(kdmer) for kdmer in kdmers] broken_rps = [broken_rp for rp in rps for broken_rp in rp.shatter(40)] broken_rps = list(set(broken_rps)) graph = to_debruijn_graph(broken_rps) contig_paths = find_maximal_non_branching_paths(graph) contig_paths.sort(key=lambda x: len(x)) for path in contig_paths: if len(path) >= path[0].d: out = path[0].stitch(path) print(f'{len(path)} kd-mers = {out}') else:
default=False) oparser.add_argument('--cleanhtml', action='store_true', help='Clean HTML to remove javascript, css and head tags', default=False) options = oparser.parse_args() logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO if options.verbose else logging.ERROR, datefmt='%Y-%m-%d %H:%M:%S') f = None fo = None if options.input[-3:] == ".xz": f = ArchiveIterator(lzma.open(options.input, 'r')) elif options.input[-3:] == ".gz": f = ArchiveIterator(open(options.input, 'rb')) elif options.input == sys.stdin: f = ArchiveIterator(options.input.buffer) else: f = ArchiveIterator(open(options.input, 'rb')) if options.output == sys.stdout: fo = WARCWriter(options.output.buffer, gzip=True) else: fo = WARCWriter(open(options.output, 'wb'), gzip=True) if options.pdfextract: extractor = ExtrP()
if filename.endswith('.gz'): self.fd = gzip.open(filename, 'rb') try: # read a bit to make sure it's a gzip file self.fd.read(10) self.fd.seek(0, 0) except Exception, e: print >> log, "[EPGImport] File downloaded is not a valid gzip file", filename self.downloadFail(e) return elif filename.endswith('.xz') or filename.endswith('.lzma'): try: import lzma except ImportError: from backports import lzma self.fd = lzma.open(filename, 'rb') try: # read a bit to make sure it's an xz file self.fd.read(10) self.fd.seek(0, 0) except Exception, e: print >> log, "[EPGImport] File downloaded is not a valid xz file", filename self.downloadFail(e) return else: self.fd = open(filename, 'rb') if deleteFile and self.source.parser != 'epg.dat': try: print >> log, "[EPGImport] unlink", filename os.unlink(filename) except Exception, e:
with open('data/cases.csv', mode='w') as csvfile: fieldnames = [ 'id', 'url', 'name', 'name_abbreviation', 'body', 'decision_date', 'decision_year', 'decision_month', 'docket_number', 'first_page', 'last_page', 'frontend_url', 'citations_count', 'citation_0_type', 'citation_0', 'citation_1_type', 'citation_1', 'citation_2_type', 'citation_2', 'volume_barcode', 'volume_number', 'reporter_name', "reporter_id", 'court_id', 'court_name', 'jurisdiction_id', 'jurisdiction_name' ] output = csv.DictWriter(csvfile, fieldnames=fieldnames) output.writeheader() for state in ['North Carolina', 'Arkansas', 'Illinois', 'New Mexico']: with lzma.open(f'{state}-20200302-xml/data/data.jsonl.xz', mode='r') as in_file: for line in in_file: case = json.loads(str(line, 'utf8')) try: decision_date = parser.parse(case['decision_date']) except ParserError as e: # if date is out of range, parse year & month decision_date = parser.parse(case['decision_date'][:7]) case['decision_year'] = decision_date.year case['decision_month'] = decision_date.month for i, citation in enumerate(case['citations']): if i > 3: print(f"more than {len(case['citations'])} citations") break
help='Column that contains the first document of the document pairs', default=0, type=int) parser.add_argument( '--column2', help='Column that contains the second document of the document pairs', default=1, type=int) args = parser.parse_args() lang2_docs = set() lang2_read_docs = {} if args.indices[:-3] == '.xz': reader = lzma.open(args.indices, 'rt') elif args.indices[:-3] == '.gz': reader = gzip.open(args.indices, 'rt') else: reader = open(args.indices, 'r') for line in reader: fields = line.split('\t') lang2_docs.add(int(fields[args.column2])) reader.seek(0) with open_xz_or_gzip_or_plain(args.tokenized1) as tok_reader1, \ open_xz_or_gzip_or_plain(args.tokenized2) as tok_reader2, \ open_xz_or_gzip_or_plain(args.text1) as text_reader1, \ open_xz_or_gzip_or_plain(args.text2) as text_reader2:
def get_node(self, disconnect=None, options=None, may_fail=False,
             may_reconnect=False, random_hsm=False,
             feerates=(15000, 7500, 3750), start=True, log_all_io=False,
             dbfile=None, node_id=None, allow_broken_log=False,
             wait_for_bitcoind_sync=True, allow_bad_gossip=False):
    """Create (and by default start) a LightningNode for testing.

    disconnect: optional list of dev-disconnect directives written to a
        file and passed to the daemon via the dev-disconnect option.
    options: extra daemon options merged into daemon.opts.
    feerates: feerate triple applied via node.set_feerates (regtest
        estimatefee is unusable, so tests override it).
    dbfile: optional xz-compressed sqlite3 database (under tests/data)
        seeded into the node's lightning dir before start.
    start: when True the node is started before being returned; on a
        failed start the daemon is stopped and the exception re-raised.
    Returns the constructed LightningNode (also appended to self.nodes).
    """
    if not node_id:
        node_id = self.get_node_id()
    port = self.get_next_port()
    lightning_dir = os.path.join(
        self.directory, "lightning-{}/".format(node_id))
    # Start from a clean per-node directory.
    if os.path.exists(lightning_dir):
        shutil.rmtree(lightning_dir)
    socket_path = os.path.join(lightning_dir,
                               "lightning-rpc").format(node_id)
    daemon = LightningD(
        lightning_dir,
        bitcoindproxy=self.bitcoind.get_proxy(),
        port=port,
        random_hsm=random_hsm,
        node_id=node_id
    )
    # If we have a disconnect string, dump it to a file for daemon.
    if disconnect:
        daemon.disconnect_file = os.path.join(lightning_dir,
                                              "dev_disconnect")
        with open(daemon.disconnect_file, "w") as f:
            f.write("\n".join(disconnect))
        daemon.opts["dev-disconnect"] = "dev_disconnect"
    if log_all_io:
        # I/O logging is only available in developer builds.
        assert DEVELOPER
        daemon.env["LIGHTNINGD_DEV_LOG_IO"] = "1"
        daemon.opts["log-level"] = "io"
    if DEVELOPER:
        daemon.opts["dev-fail-on-subdaemon-fail"] = None
        daemon.env["LIGHTNINGD_DEV_MEMLEAK"] = "1"
        if os.getenv("DEBUG_SUBD"):
            daemon.opts["dev-debugger"] = os.getenv("DEBUG_SUBD")
        if VALGRIND:
            daemon.env["LIGHTNINGD_DEV_NO_BACKTRACE"] = "1"
        if not may_reconnect:
            daemon.opts["dev-no-reconnect"] = None
    # Caller-supplied options override/extend the defaults set above.
    if options is not None:
        daemon.opts.update(options)
    rpc = LightningRpc(socket_path, self.executor)
    node = LightningNode(daemon, rpc, self.bitcoind, self.executor,
                         may_fail=may_fail, may_reconnect=may_reconnect,
                         allow_broken_log=allow_broken_log,
                         allow_bad_gossip=allow_bad_gossip)
    # Regtest estimatefee are unusable, so override.
    node.set_feerates(feerates, False)
    self.nodes.append(node)
    if VALGRIND:
        node.daemon.cmd_prefix = [
            'valgrind', '-q', '--trace-children=yes',
            '--trace-children-skip=*python*,*bitcoin-cli*',
            '--error-exitcode=7',
            '--log-file={}/valgrind-errors.%p'.format(
                node.daemon.lightning_dir)
        ]
    if dbfile:
        # Seed the node's database from a compressed fixture. 'xb' fails
        # if a db already exists. NOTE(review): 'out' is never explicitly
        # closed here -- consider a with-block.
        out = open(os.path.join(node.daemon.lightning_dir,
                                'lightningd.sqlite3'), 'xb')
        with lzma.open(os.path.join('tests/data', dbfile), 'rb') as f:
            out.write(f.read())
    if start:
        try:
            node.start(wait_for_bitcoind_sync)
        except Exception:
            # Don't leak a half-started daemon on failure.
            node.daemon.stop()
            raise
    return node
def run(self):
    """Runs the UpdateThread.

    Downloads the latest frida-server archive, extracts it, pushes it to
    the connected Android device via adb, installs it under /system/xbin
    (or /system/bin as fallback) and starts it. Progress and failures are
    reported through the onStatusUpdate / onError / onFinished signals;
    every error path returns early after emitting onError.
    """
    # Preconditions: adb handle, minimum adb version, connectivity, URL.
    if self._adb is None:
        self.onError.emit('ADB not set')
        return
    if not self._adb.min_required:
        self.onError.emit('ADB MinRequired')
        return
    if not utils.is_connected():
        self.onError.emit('Not connected')
        return
    if self._frida_update_url is None or self._frida_update_url == '':
        self.onError.emit('Missing frida download url')
        return
    self.onStatusUpdate.emit('Downloading latest frida')
    try:
        if utils.is_connected():
            request = requests.get(self._frida_update_url, stream=True)
        else:
            self.onError.emit('Not connected')
            return
    except requests.ConnectionError:
        self.onError.emit('Failed to download latest frida')
        return
    # reset url
    self._frida_update_url = None
    if request is not None and request.status_code == 200:
        # write data to local file
        try:
            with open('frida.xz', 'wb') as frida_archive:
                for chunk in request.iter_content(chunk_size=1024):
                    if chunk:
                        frida_archive.write(chunk)
        except EnvironmentError:
            self.onError.emit('Failed to write frida.xz')
            return
        # start extraction
        if os.path.exists('frida.xz'):
            self.onStatusUpdate.emit('Extracting latest frida')
            try:
                with lzma.open('frida.xz') as frida_archive:
                    with open('frida-server', 'wb') as frida_binary:
                        frida_binary.write(frida_archive.read())
                # remove downloaded archive
                os.remove('frida.xz')
            except lzma.LZMAError:
                self.onError.emit('Failed to extract frida.xz')
                return
            except EnvironmentError:
                self.onError.emit('Failed to write frida')
                return
        else:
            self.onError.emit('Failed to open frida.xz')
            return
        self.onStatusUpdate.emit('Mounting devices filesystem')
        # mount system rw
        if self._adb.mount_system():
            self.onStatusUpdate.emit('Pushing to device')
            # push file to device
            self._adb.push('frida-server', '/sdcard/')
            self.onStatusUpdate.emit('Setting up and starting frida')
            # kill frida
            self._adb.kill_frida()
            _device_path = '/system/xbin'
            res = self._adb.su_cmd('ls ' + _device_path)
            if 'No such file or directory' in res:
                # use /system/bin
                _device_path = _device_path.replace('x', '')
            # copy file -- note: mv gives sometimes an invalid id error
            self._adb.su_cmd('cp /sdcard/frida-server ' + _device_path
                             + '/frida-server')
            # remove file
            self._adb.su_cmd('rm ' + _device_path + '/frida')  # remove old named file
            self._adb.su_cmd('rm /sdcard/frida-server')  # just to make sure
            self._adb.su_cmd('chown root:root ' + _device_path
                             + '/frida-server')
            # make it executable
            self._adb.su_cmd('chmod 06755 ' + _device_path + '/frida-server')
            # start it
            if self._adb.get_frida_version():
                if not self._adb.start_frida():
                    self.onError.emit(
                        'Failed to start fridaserver on Device')
        else:
            # NOTE(review): mount failure only prints; no onError signal.
            print('failed to mount /system on device')
        # delete extracted file
        if os.path.exists('frida-server'):
            os.remove('frida-server')
    else:
        self.onError.emit('Failed to download latest frida! Error: %d'
                          % request.status_code)
        return
    self.onFinished.emit()
def RequestThread(ReqType, ReqMsg): global isConnected record = [] if isConnected: startTime = datetime.now() record.append('\'' + str(startTime)) record.append(ReqType) try: #Send request to server clientSocket.send(ReqMsg.encode()) except socket.error as msg: now = str(datetime.now())[:-7] LoggingText.insert( 'insert', '{0}: Server Connected failed({1})\n'.format(now, msg)) isConnected = False else: LoggingText.insert('insert', '{} request sent\n'.format(ReqType)) #Receive message from server response = clientSocket.recv(1024) if response: LoggingText.insert( 'insert', 'Response from server: {0} \n'.format( response.decode('utf-8'))) expectedResponse = '{} request accepted'.format(ReqType) if response.decode() == expectedResponse: filepath = SourceFilePathVar.get() filename = os.path.basename(filepath) filesize = os.stat(filepath).st_size record.append(filesize) if os.path.isfile(filepath): if IsCompressedVar.get() == 1: zipStartTime = datetime.now() #Zipfile compression # zipfilename = filename.split('.')[0] + '.zip' # with zipfile.ZipFile(zipfilename, 'w', zipfile.ZIP_DEFLATED) as f: # f.write(filename) #lzma compression zipfilename = filename.split('.')[0] + '.xz' with lzma.open(zipfilename, 'wb') as f: with open(filename, 'rb') as pf: textContent = pf.read() f.write(textContent) filepath = zipfilename filename = zipfilename filesize = os.stat(filepath).st_size record.append(filesize) zipDuration = datetime.now() - zipStartTime record.append('\'' + str(zipDuration)) else: record.append('None') record.append(0) #Send file info to server #Header structure : file name lentgh = 128 bytes; filesize = 8bytes; IsCompressed = 4bytes(int) fhead = struct.pack('128sQI', bytes(filename.encode('utf-8')), filesize, IsCompressedVar.get()) clientSocket.send(fhead) LoggingText.insert( 'insert', '{} file header sent\n'.format(ReqType)) sendStartTime = datetime.now() #Send data to server with open(filepath, 'rb') as fp: data = fp.read() clientSocket.sendall(data) 
LoggingText.insert( 'insert', '{} file send over...\n'.format(ReqType)) sendoverTime = datetime.now() sendoverDetal = sendoverTime - sendStartTime record.append('\'' + str(sendoverDetal)) LoggingText.insert( 'insert', 'Waiting for server processing and feedback\n') rcvStartTime = datetime.now() #4. Receive the processed result fileinfo_size = struct.calcsize('128sQI') fileinfo_data = clientSocket.recv(fileinfo_size) if fileinfo_data: filename, filesize, IsCompressed = struct.unpack( '128sQI', fileinfo_data) rcv_file_name = filename.decode('utf-8').strip( '\x00') LoggingText.insert( 'insert', 'Processed file header info is received for {}\n' .format(ReqType)) received_size = 0 with open(rcv_file_name, 'wb') as rcv_file_handle: while not (received_size == filesize): if (filesize - received_size > 4096): data = clientSocket.recv(4096) if data: received_size += len(data) else: isConnected = False break else: data = clientSocket.recv(filesize - received_size) if data: received_size = filesize else: isConnected = False break rcv_file_handle.write(data) LoggingText.insert( 'insert', 'Processed file for {} is received\n'.format( ReqType)) reverseoverDetal = datetime.now() - rcvStartTime record.append('\'' + str(reverseoverDetal)) if isConnected: if IsCompressed: LoggingText.insert( 'insert', 'Processed file for {} was compressed\n' .format(ReqType)) unzipStartTime = datetime.now() # with zipfile.ZipFile(rcv_file_name, 'r') as zf: # filepath = zf.extract(zf.namelist()[0]) #suppose only one file # #rcv_file_name = os.path.basename(filepath) # rcv_file_name = filepath with lzma.open(rcv_file_name, 'rb') as f: zipContent = f.read() localFileName = 'ReceivedProcessedFor{}.txt'.format( ReqType) with open(localFileName, 'w') as uf: uf.write( zipContent.decode("utf-8")) #Dsiplay partial content in GUI ProcessedFileText.delete(1.0, 'end') ProcessedFileText.insert( 'insert', zipContent.decode("utf-8")[0:1000]) unzipDuration = datetime.now( ) - unzipStartTime record.append('\'' + 
str(unzipDuration)) LoggingText.insert( 'insert', 'Processed file for {} is decompressed\n' .format(ReqType)) else: record.append(0) total_duration = datetime.now() - startTime record.append('\'' + str(total_duration)) #with open(rcv_file_name,'rb') as rf: # all_data_str = rf.read().decode('utf-8') #LoggingText.insert('insert', 'Processed file is stored locally\n') #5. Display the replaced result #ProcessedFileText.delete(1.0,'end') #ProcessedFileText.insert('insert', all_data_str[0:2000]) #ProcessedFileText.insert('insert', 'Processed data received') else: LoggingText.insert( 'insert', 'No connection! Please connect firstly\n') else: LoggingText.insert('insert', 'The file path is not valid') else: isConnected = False LoggingText.insert('insert', 'No connection! Please connect firstly\n') else: LoggingText.insert('insert', 'No connection! Please connect firstly\n') print('Request thread for {} ended'.format(ReqType)) with open('record.csv', 'a+') as csv_record: csv_write = csv.writer(csv_record) csv_write.writerow(record)
colorama.init(autoreset=True, strip=False) if args.dark: color_type = colorama.Fore.BLUE + colorama.Style.BRIGHT color_obj = colorama.Fore.YELLOW + colorama.Style.NORMAL else: color_type = colorama.Fore.BLUE + colorama.Style.NORMAL color_obj = colorama.Fore.YELLOW + colorama.Style.DIM # next-char values which will trigger ignoreself ignorechars = set([':', '.']) # Loop through and search with os.scandir(os.path.join('resources', game, 'dumps')) as it: for entry in sorted(it, key=lambda e: getattr(e, 'name').lower()): if entry.name[-8:] == '.dump.xz' or entry.name[-7:] == '.txt.xz': with lzma.open(entry.path, 'rt', encoding='latin1') as df: cur_obj = None cur_type = None found_result = False for line in df.readlines(): match = re.search( '\*\*\* Property dump for object \'(\S+) (\S+?)\' \*\*\*', line) if match: cur_type = match.group(1) cur_obj = match.group(2) if args.ignoreself and cur_obj.lower().startswith( ignore_search_str): if len(cur_obj) > len(ignore_search_str): if cur_obj[len( ignore_search_str)] in ignorechars:
def save_compressed(filename):
    """Create an xz-compressed sibling of *filename* (named ``filename.xz``).

    The source file is read as text and written through lzma's text-mode
    writer at maximum compression (preset 9).
    """
    target = filename + '.xz'
    with lzma.open(target, 'wt', preset=9) as sink:
        with open(filename, 'r') as source:
            sink.write(source.read())
def main(): parser = argparse.ArgumentParser( prog=_program, description= 'pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages', usage='''pangolin <query> [options]''') parser.add_argument('query', nargs="*", help='Query fasta file of sequences to analyse.') parser.add_argument( '-o', '--outdir', action="store", help="Output directory. Default: current working directory") parser.add_argument( '--outfile', action="store", help="Optional output file name. Default: lineage_report.csv") parser.add_argument('--alignment', action="store_true", help="Optional alignment output.") parser.add_argument( '-d', '--datadir', action='store', dest="datadir", help= "Data directory minimally containing a fasta alignment and guide tree") parser.add_argument( '--tempdir', action="store", help="Specify where you want the temp stuff to go. Default: $TMPDIR") parser.add_argument( "--no-temp", action="store_true", help="Output all intermediate files, for dev purposes.") parser.add_argument( '--decompress-model', action="store_true", dest="decompress", help= "Permanently decompress the model file to save time running pangolin.") parser.add_argument( '--max-ambig', action="store", default=0.5, type=float, help= "Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.5", dest="maxambig") parser.add_argument( '--min-length', action="store", default=25000, type=int, help= "Minimum query length allowed for pangolin to attempt assignment. 
Default: 25000", dest="minlen") parser.add_argument('--panGUIlin', action='store_true', help="Run web-app version of pangolin", dest="panGUIlin") parser.add_argument("--verbose", action="store_true", help="Print lots of stuff to screen") parser.add_argument("-t", "--threads", action="store", help="Number of threads") parser.add_argument("-v", "--version", action='version', version=f"pangolin {__version__}") parser.add_argument("-pv", "--pangoLEARN-version", action='version', version=f"pangoLEARN {pangoLEARN.__version__}", help="show pangoLEARN's version number and exit") parser.add_argument( "--update", action='store_true', default=False, help= "Automatically updates to latest release of pangolin and pangoLEARN, then exits" ) compression = parser.add_mutually_exclusive_group() compression.add_argument("--gzip", action="store_true", help="Query files are gzip-compressed.") compression.add_argument("--xz", action="store_true", help="Query files are xz-compressed.") if len(sys.argv) == 1: parser.print_help() sys.exit(-1) args = parser.parse_args() if args.update: update(__version__, pangoLEARN.__version__) snakefile = os.path.join(thisdir, 'scripts', 'pangolearn.smk') if not os.path.exists(snakefile): sys.stderr.write( 'Error: cannot find Snakefile at {}\n'.format(snakefile)) sys.exit(-1) pfunk.check_installs() # to enable not having to pass a query if running update # by allowing query to accept 0 to many arguments if len(args.query) > 1: print( pfunk.cyan( f"Error: Too many query (input) fasta files supplied: {args.query}\nPlease supply one only" )) parser.print_help() sys.exit(-1) else: # find the query fasta query = os.path.join(cwd, args.query[0]) if not os.path.exists(query): sys.stderr.write( 'Error: cannot find query (input) fasta file at {}\n' 'Please enter your fasta sequence file and refer to pangolin usage at:\n' 'https://github.com/hCoV-2019/pangolin#usage\n' ' for detailed instructions\n'.format(query)) sys.exit(-1) else: print(pfunk.green(f"The query 
file is:") + f"{query}") # default output dir if args.outdir: outdir = os.path.join(cwd, args.outdir) if not os.path.exists(outdir): try: os.mkdir(outdir) except: sys.stderr.write( pfunk.cyan(f'Error: cannot create directory:') + f"{outdir}") sys.exit(-1) else: outdir = cwd if args.outfile: outfile = os.path.join(outdir, args.outfile) else: outfile = os.path.join(outdir, "lineage_report.csv") if args.tempdir: to_be_dir = os.path.join(cwd, args.tempdir) if not os.path.exists(to_be_dir): os.mkdir(to_be_dir) temporary_directory = tempfile.TemporaryDirectory(suffix=None, prefix=None, dir=to_be_dir) tempdir = temporary_directory.name else: temporary_directory = tempfile.TemporaryDirectory(suffix=None, prefix=None, dir=None) tempdir = temporary_directory.name if args.no_temp: print( pfunk.green(f"--no-temp:") + f"all intermediate files will be written to {outdir}") tempdir = outdir if args.alignment: align_dir = outdir alignment_out = True else: align_dir = tempdir alignment_out = False if args.threads: print( pfunk.cyan( f"\n--threads flag used, but threading not currently supported. Continuing with one thread." 
)) """ QC steps: 1) check no empty seqs 2) check N content 3) write a file that contains just the seqs to run """ do_not_run = [] run = [] if args.gzip: # user says input FASTA file is gzip-compressed, use gzip module to stream text to SeqIO query = gzip.open(query, 'rt') # replace file path (str) with handle if args.xz: query = lzma.open(query, 'rt') for record in SeqIO.parse(query, "fasta"): # replace spaces in sequence headers with underscores record.description = record.description.replace(' ', '_') record.id = record.description if "," in record.id: record.id = record.id.replace(",", "_") if len(record) < args.minlen: record.description = record.description + f" fail=seq_len:{len(record)}" do_not_run.append(record) print(record.id, "\tsequence too short") else: num_N = str(record.seq).upper().count("N") prop_N = round(num_N / len(record.seq), 2) if prop_N > args.maxambig: record.description = record.description + f" fail=N_content:{prop_N}" do_not_run.append(record) print(f"{record.id}\thas an N content of {prop_N}") else: run.append(record) if run == []: with open(outfile, "w") as fw: fw.write( "taxon,lineage,conflict,pangolin_version,pangoLEARN_version,pango_version,status,note\n" ) for record in do_not_run: desc = record.description.split(" ") reason = "" for item in desc: if item.startswith("fail="): reason = item.split("=")[1] fw.write( f"{record.id},None,NA,{__version__},{pangoLEARN.__version__},PANGO_VERSION,fail,{reason}\n" ) print(pfunk.cyan(f'Note: no query sequences have passed the qc\n')) sys.exit(0) post_qc_query = os.path.join(tempdir, 'query.post_qc.fasta') with open(post_qc_query, "w") as fw: SeqIO.write(run, fw, "fasta") qc_fail = os.path.join(tempdir, 'query.failed_qc.fasta') with open(qc_fail, "w") as fw: SeqIO.write(do_not_run, fw, "fasta") config = { "query_fasta": post_qc_query, "outdir": outdir, "outfile": outfile, "tempdir": tempdir, "aligndir": align_dir, "alignment_out": alignment_out, "trim_start": 265, # where to pad to using 
datafunk "trim_end": 29674, # where to pad after using datafunk "qc_fail": qc_fail, "pangoLEARN_version": pangoLEARN.__version__, "pangolin_version": __version__, "pango_version": PANGO_VERSION } # find the data data_dir = "" if args.datadir: data_dir = os.path.join(cwd, args.datadir) version = "Unknown" for r, d, f in os.walk(data_dir): for fn in f: if fn == "__init__.py": print("Found __init__.py") with open(os.path.join(r, fn), "r") as fr: for l in fr: if l.startswith("__version__"): l = l.rstrip("\n") version = l.split('=')[1] version = version.replace('"', "").replace(" ", "") print("pangoLEARN version", version) config["pangoLEARN_version"] = version if not args.datadir: pangoLEARN_dir = pangoLEARN.__path__[0] data_dir = os.path.join(pangoLEARN_dir, "data") print(f"Looking in {data_dir} for data files...") trained_model = "" header_file = "" lineages_csv = "" for r, d, f in os.walk(data_dir): for fn in f: if fn == "decisionTreeHeaders_v1.joblib": header_file = os.path.join(r, fn) elif fn == "decisionTree_v1.joblib": trained_model = os.path.join(r, fn) elif fn == "lineages.metadata.csv": lineages_csv = os.path.join(r, fn) if trained_model == "" or header_file == "" or lineages_csv == "": print( pfunk.cyan( "Check your environment, didn't find appropriate files from the pangoLEARN repo.\n" "Trained model must be installed, please see https://cov-lineages.org/pangolin.html " "for installation instructions.")) exit(1) else: if args.decompress: prev_size = os.path.getsize(trained_model) print("Decompressing model and header files") model = joblib.load(trained_model) joblib.dump(model, trained_model, compress=0) headers = joblib.load(header_file) joblib.dump(headers, header_file, compress=0) if os.path.getsize(trained_model) >= prev_size: print( pfunk.green( f'Success! Decompressed the model file. Exiting\n')) sys.exit(0) else: print( pfunk.cyan( f'Error: failed to decompress model. 
Exiting\n')) sys.exit(0) print(pfunk.green("\nData files found")) print(f"Trained model:\t{trained_model}") print(f"Header file:\t{header_file}") print(f"Lineages csv:\t{lineages_csv}") config["trained_model"] = trained_model config["header_file"] = header_file reference_fasta = pkg_resources.resource_filename('pangolin', 'data/reference.fasta') config["reference_fasta"] = reference_fasta variants_file = pkg_resources.resource_filename('pangolin', 'data/config_b.1.1.7.csv') config["b117_variants"] = variants_file variants_file = pkg_resources.resource_filename('pangolin', 'data/config_b.1.351.csv') config["b1351_variants"] = variants_file variants_file = pkg_resources.resource_filename('pangolin', 'data/config_p.1.csv') config["p1_variants"] = variants_file variants_file = pkg_resources.resource_filename('pangolin', 'data/config_p.2.csv') config["p2_variants"] = variants_file variants_file = pkg_resources.resource_filename('pangolin', 'data/config_p.3.csv') config["p3_variants"] = variants_file if args.panGUIlin: config["lineages_csv"] = lineages_csv if args.verbose: quiet_mode = False config["log_string"] = "" else: quiet_mode = True lh_path = os.path.realpath(lh.__file__) config["log_string"] = f"--quiet --log-handler-script {lh_path} " if args.verbose: print(pfunk.green("\n**** CONFIG ****")) for k in sorted(config): print(pfunk.green(k), config[k]) status = snakemake.snakemake(snakefile, printshellcmds=True, forceall=True, force_incomplete=True, workdir=tempdir, config=config, cores=1, lock=False) else: logger = custom_logger.Logger() status = snakemake.snakemake(snakefile, printshellcmds=False, forceall=True, force_incomplete=True, workdir=tempdir, config=config, cores=1, lock=False, quiet=True, log_handler=logger.log_handler) if status: # translate "success" into shell exit code of 0 return 0 return 1
[[0, 0], [0, 0], [0, 0]], [[0, 0], [0, 0], [0, 0]], [[0, 0], [0, 0], [0, 0]], [[0, 0], [0, 0], [0, 0]], ] reach_turn_points = [[0] * TURNS, [0] * TURNS, [0] * TURNS, [0] * TURNS, [0] * TURNS] reach_turn_counts = [[0] * TURNS, [0] * TURNS, [0] * TURNS, [0] * TURNS, [0] * TURNS] outcome_names = ('I won', 'Draw', 'Bystander', 'Other tsumod', 'I dealt in', 'Averages') for player in account_names: counter.player = player with lzma.open(directory_name + player + '.pickle.7z', 'rb') as infile: logs = pickle.load(infile) for key, log in logs.items(): if args.since and args.since > key[0:8]: continue if args.before and args.before <= key[0:8]: continue gamecount += 1 game = TenhouDecoder.Game(lang='DEFAULT', suppress_draws=False) game.decode(log['content'].decode()) counter.reach_outcomes = [] counter.addGame(game) for outcome in counter.reach_outcomes: # aggregate counter.reach_outcomes
def main():
    """Entry point for the archive module.

    Expands the requested source paths (including globs), decides whether the
    operation is a multi-file archive or a single-file compression, writes the
    archive/compressed file with the requested format (zip/tar/gz/bz2/xz),
    optionally removes the sources, and reports results via
    ``module.exit_json`` / ``module.fail_json``.

    Fix applied: removed a second, unreachable ``module.fail_json`` call in the
    xz-availability guard — ``fail_json`` exits the process, so the duplicate
    message could never be emitted and was dead code.
    """
    module = AnsibleModule(
        argument_spec=dict(
            path=dict(type='list', required=True),
            format=dict(type='str', default='gz', choices=['bz2', 'gz', 'tar', 'xz', 'zip']),
            dest=dict(type='path'),
            exclude_path=dict(type='list'),
            force_archive=dict(type='bool', default=False),
            remove=dict(type='bool', default=False),
        ),
        add_file_common_args=True,
        supports_check_mode=True,
    )

    params = module.params
    check_mode = module.check_mode
    paths = params['path']
    dest = params['dest']
    b_dest = None if not dest else to_bytes(dest, errors='surrogate_or_strict')
    exclude_paths = params['exclude_path']
    remove = params['remove']

    b_expanded_paths = []
    b_expanded_exclude_paths = []
    fmt = params['format']
    b_fmt = to_bytes(fmt, errors='surrogate_or_strict')
    force_archive = params['force_archive']
    globby = False
    changed = False
    state = 'absent'

    # Simple or archive file compression (inapplicable with 'zip' since it's always an archive)
    archive = False
    b_successes = []

    # Fail early: xz support needs the lzma (or backports.lzma) module.
    if not HAS_LZMA and fmt == 'xz':
        module.fail_json(msg=missing_required_lib("lzma or backports.lzma",
                                                  reason="when using xz format"),
                         exception=LZMA_IMP_ERR)

    for path in paths:
        b_path = os.path.expanduser(
            os.path.expandvars(to_bytes(path, errors='surrogate_or_strict')))

        # Expand any glob characters. If found, add the expanded glob to the
        # list of expanded_paths, which might be empty.
        if (b'*' in b_path or b'?' in b_path):
            b_expanded_paths.extend(glob.glob(b_path))
            globby = True

        # If there are no glob characters the path is added to the expanded paths
        # whether the path exists or not
        else:
            b_expanded_paths.append(b_path)

    # Only attempt to expand the exclude paths if it exists
    if exclude_paths:
        for exclude_path in exclude_paths:
            b_exclude_path = os.path.expanduser(
                os.path.expandvars(to_bytes(exclude_path, errors='surrogate_or_strict')))

            # Expand any glob characters. If found, add the expanded glob to the
            # list of expanded_paths, which might be empty.
            if (b'*' in b_exclude_path or b'?' in b_exclude_path):
                b_expanded_exclude_paths.extend(glob.glob(b_exclude_path))

            # If there are no glob character the exclude path is added to the expanded
            # exclude paths whether the path exists or not.
            else:
                b_expanded_exclude_paths.append(b_exclude_path)

    if not b_expanded_paths:
        return module.fail_json(
            path=', '.join(paths),
            expanded_paths=to_native(b', '.join(b_expanded_paths),
                                     errors='surrogate_or_strict'),
            msg='Error, no source paths were found')

    # Only try to determine if we are working with an archive or not if we haven't set archive to true
    if not force_archive:
        # If we actually matched multiple files or TRIED to, then
        # treat this as a multi-file archive
        archive = globby or os.path.isdir(b_expanded_paths[0]) or len(b_expanded_paths) > 1
    else:
        archive = True

    # Default created file name (for single-file archives) to
    # <file>.<format>
    if not b_dest and not archive:
        b_dest = b'%s.%s' % (b_expanded_paths[0], b_fmt)

    # Force archives to specify 'dest'
    if archive and not b_dest:
        module.fail_json(
            dest=dest,
            path=', '.join(paths),
            msg='Error, must specify "dest" when archiving multiple files or trees')

    b_sep = to_bytes(os.sep, errors='surrogate_or_strict')

    b_archive_paths = []
    b_missing = []
    b_arcroot = b''

    for b_path in b_expanded_paths:
        # Use the longest common directory name among all the files
        # as the archive root path
        if b_arcroot == b'':
            b_arcroot = os.path.dirname(b_path) + b_sep
        else:
            for i in range(len(b_arcroot)):
                if b_path[i] != b_arcroot[i]:
                    break

            if i < len(b_arcroot):
                b_arcroot = os.path.dirname(b_arcroot[0:i + 1])

            b_arcroot += b_sep

        # Don't allow archives to be created anywhere within paths to be removed
        if remove and os.path.isdir(b_path):
            b_path_dir = b_path
            if not b_path.endswith(b'/'):
                b_path_dir += b'/'

            if b_dest.startswith(b_path_dir):
                module.fail_json(
                    path=', '.join(paths),
                    msg='Error, created archive can not be contained in source paths when remove=True')

        if os.path.lexists(b_path) and b_path not in b_expanded_exclude_paths:
            b_archive_paths.append(b_path)
        else:
            b_missing.append(b_path)

    # No source files were found but the named archive exists: are we 'compress' or 'archive' now?
    if len(b_missing) == len(b_expanded_paths) and b_dest and os.path.exists(b_dest):
        # Just check the filename to know if it's an archive or simple compressed file
        if re.search(br'(\.tar|\.tar\.gz|\.tgz|\.tbz2|\.tar\.bz2|\.tar\.xz|\.zip)$',
                     os.path.basename(b_dest), re.IGNORECASE):
            state = 'archive'
        else:
            state = 'compress'

    # Multiple files, or globbiness
    elif archive:
        if not b_archive_paths:
            # No source files were found, but the archive is there.
            if os.path.lexists(b_dest):
                state = 'archive'
        elif b_missing:
            # SOME source files were found, but not all of them
            state = 'incomplete'

        archive = None
        size = 0
        errors = []

        if os.path.lexists(b_dest):
            size = os.path.getsize(b_dest)

        if state != 'archive':
            if check_mode:
                changed = True
            else:
                try:
                    # Slightly more difficult (and less efficient!) compression using zipfile module
                    if fmt == 'zip':
                        arcfile = zipfile.ZipFile(
                            to_native(b_dest, errors='surrogate_or_strict', encoding='ascii'),
                            'w', zipfile.ZIP_DEFLATED, True)

                    # Easier compression using tarfile module
                    elif fmt == 'gz' or fmt == 'bz2':
                        arcfile = tarfile.open(
                            to_native(b_dest, errors='surrogate_or_strict', encoding='ascii'),
                            'w|' + fmt)

                    # python3 tarfile module allows xz format but for python2 we have to create the tarfile
                    # in memory and then compress it with lzma.
                    elif fmt == 'xz':
                        arcfileIO = io.BytesIO()
                        arcfile = tarfile.open(fileobj=arcfileIO, mode='w')

                    # Or plain tar archiving
                    elif fmt == 'tar':
                        arcfile = tarfile.open(
                            to_native(b_dest, errors='surrogate_or_strict', encoding='ascii'),
                            'w')

                    b_match_root = re.compile(br'^%s' % re.escape(b_arcroot))
                    for b_path in b_archive_paths:
                        if os.path.isdir(b_path):
                            # Recurse into directories
                            for b_dirpath, b_dirnames, b_filenames in os.walk(b_path, topdown=True):
                                if not b_dirpath.endswith(b_sep):
                                    b_dirpath += b_sep

                                for b_dirname in b_dirnames:
                                    b_fullpath = b_dirpath + b_dirname
                                    n_fullpath = to_native(b_fullpath,
                                                           errors='surrogate_or_strict',
                                                           encoding='ascii')
                                    n_arcname = to_native(b_match_root.sub(b'', b_fullpath),
                                                          errors='surrogate_or_strict')

                                    try:
                                        if fmt == 'zip':
                                            arcfile.write(n_fullpath, n_arcname)
                                        else:
                                            arcfile.add(n_fullpath, n_arcname, recursive=False)
                                    except Exception as e:
                                        errors.append('%s: %s' % (n_fullpath, to_native(e)))

                                for b_filename in b_filenames:
                                    b_fullpath = b_dirpath + b_filename
                                    n_fullpath = to_native(b_fullpath,
                                                           errors='surrogate_or_strict',
                                                           encoding='ascii')
                                    n_arcname = to_native(b_match_root.sub(b'', b_fullpath),
                                                          errors='surrogate_or_strict')

                                    # Skip adding the destination archive to itself.
                                    if not filecmp.cmp(b_fullpath, b_dest):
                                        try:
                                            if fmt == 'zip':
                                                arcfile.write(n_fullpath, n_arcname)
                                            else:
                                                arcfile.add(n_fullpath, n_arcname, recursive=False)

                                            b_successes.append(b_fullpath)
                                        except Exception as e:
                                            errors.append('Adding %s: %s' % (to_native(b_path), to_native(e)))
                        else:
                            path = to_native(b_path, errors='surrogate_or_strict', encoding='ascii')
                            arcname = to_native(b_match_root.sub(b'', b_path),
                                                errors='surrogate_or_strict')
                            if fmt == 'zip':
                                arcfile.write(path, arcname)
                            else:
                                arcfile.add(path, arcname, recursive=False)

                            b_successes.append(b_path)

                except Exception as e:
                    expanded_fmt = 'zip' if fmt == 'zip' else ('tar.' + fmt)
                    module.fail_json(
                        msg='Error when writing %s archive at %s: %s' % (expanded_fmt, dest, to_native(e)),
                        exception=format_exc())

                if arcfile:
                    arcfile.close()
                    state = 'archive'

                # For xz the tar was built in memory; compress it to disk now.
                if fmt == 'xz':
                    with lzma.open(b_dest, 'wb') as f:
                        f.write(arcfileIO.getvalue())
                    arcfileIO.close()

                if errors:
                    module.fail_json(
                        msg='Errors when writing archive at %s: %s' % (dest, '; '.join(errors)))

        if state in ['archive', 'incomplete'] and remove:
            for b_path in b_successes:
                try:
                    if os.path.isdir(b_path):
                        shutil.rmtree(b_path)
                    elif not check_mode:
                        os.remove(b_path)
                except OSError as e:
                    errors.append(to_native(b_path))

            if errors:
                module.fail_json(dest=dest,
                                 msg='Error deleting some source files: ',
                                 files=errors)

        # Rudimentary check: If size changed then file changed. Not perfect, but easy.
        if not check_mode and os.path.getsize(b_dest) != size:
            changed = True

        if b_successes and state != 'incomplete':
            state = 'archive'

    # Simple, single-file compression
    else:
        b_path = b_expanded_paths[0]

        # No source or compressed file
        if not (os.path.exists(b_path) or os.path.lexists(b_dest)):
            state = 'absent'

        # if it already exists and the source file isn't there, consider this done
        elif not os.path.lexists(b_path) and os.path.lexists(b_dest):
            state = 'compress'

        else:
            if module.check_mode:
                if not os.path.exists(b_dest):
                    changed = True
            else:
                size = 0
                f_in = f_out = arcfile = None

                if os.path.lexists(b_dest):
                    size = os.path.getsize(b_dest)

                try:
                    if fmt == 'zip':
                        arcfile = zipfile.ZipFile(
                            to_native(b_dest, errors='surrogate_or_strict', encoding='ascii'),
                            'w', zipfile.ZIP_DEFLATED, True)
                        arcfile.write(
                            to_native(b_path, errors='surrogate_or_strict', encoding='ascii'),
                            to_native(b_path[len(b_arcroot):], errors='surrogate_or_strict'))
                        arcfile.close()
                        state = 'archive'  # because all zip files are archives
                    elif fmt == 'tar':
                        arcfile = tarfile.open(
                            to_native(b_dest, errors='surrogate_or_strict', encoding='ascii'),
                            'w')
                        arcfile.add(to_native(b_path, errors='surrogate_or_strict', encoding='ascii'))
                        arcfile.close()
                    else:
                        f_in = open(b_path, 'rb')

                        n_dest = to_native(b_dest, errors='surrogate_or_strict', encoding='ascii')
                        if fmt == 'gz':
                            f_out = gzip.open(n_dest, 'wb')
                        elif fmt == 'bz2':
                            f_out = bz2.BZ2File(n_dest, 'wb')
                        elif fmt == 'xz':
                            f_out = lzma.LZMAFile(n_dest, 'wb')
                        else:
                            raise OSError("Invalid format")

                        shutil.copyfileobj(f_in, f_out)

                    b_successes.append(b_path)

                except OSError as e:
                    module.fail_json(
                        path=to_native(b_path),
                        dest=dest,
                        msg='Unable to write to compressed file: %s' % to_native(e),
                        exception=format_exc())

                if arcfile:
                    arcfile.close()
                if f_in:
                    f_in.close()
                if f_out:
                    f_out.close()

                # Rudimentary check: If size changed then file changed. Not perfect, but easy.
                if os.path.getsize(b_dest) != size:
                    changed = True

                state = 'compress'

            if remove and not check_mode:
                try:
                    os.remove(b_path)
                except OSError as e:
                    module.fail_json(
                        path=to_native(b_path),
                        msg='Unable to remove source file: %s' % to_native(e),
                        exception=format_exc())

    params['path'] = b_dest
    file_args = module.load_file_common_arguments(params)

    if not check_mode:
        changed = module.set_fs_attributes_if_different(file_args, changed)

    module.exit_json(
        archived=[to_native(p, errors='surrogate_or_strict') for p in b_successes],
        dest=dest,
        changed=changed,
        state=state,
        arcroot=to_native(b_arcroot, errors='surrogate_or_strict'),
        missing=[to_native(p, errors='surrogate_or_strict') for p in b_missing],
        expanded_paths=[to_native(p, errors='surrogate_or_strict') for p in b_expanded_paths],
        expanded_exclude_paths=[to_native(p, errors='surrogate_or_strict') for p in b_expanded_exclude_paths],
    )
def xz_file(tmp_path_factory):
    """Fixture: write FILE_CONTENT into an xz-compressed file and return its path."""
    target = tmp_path_factory.mktemp("data") / "file.xz"
    payload = FILE_CONTENT.encode("utf-8")
    with lzma.open(target, "wb") as handle:
        handle.write(payload)
    return target
def download_pdb_isf(
        self,
        guid: str,
        age: int,
        pdb_name: str,
        progress_callback: constants.ProgressCallback = None) -> None:
    """Attempts to download the PDB file, convert it to an ISF file and save
    it to one of the symbol locations.

    Tries each directory on ``symbols.__path__`` in turn until one is
    writable; stops (``break``) after the first successful attempt. On any
    failure the partially-written output file is deleted in the ``finally``
    block so a truncated/empty symbol file is never left behind.

    Args:
        guid: PDB GUID used to build the symbol filter string.
        age: PDB age, appended to the GUID in the filter string.
        pdb_name: Name of the PDB file to retrieve.
        progress_callback: Optional callback forwarded to the retriever/reader.
    """
    # Check for writability
    filter_string = os.path.join(pdb_name, guid + "-" + str(age))
    for path in symbols.__path__:
        # Store any temporary files created by downloading PDB files
        tmp_files = []
        potential_output_filename = os.path.join(
            path, "windows", filter_string + ".json.xz")
        # Tracks whether real JSON was written; used by the finally block to
        # decide if the output file must be cleaned up.
        data_written = False
        try:
            os.makedirs(os.path.dirname(potential_output_filename), exist_ok=True)
            # Opening the output first acts as the writability probe: a
            # PermissionError here skips to the next symbols directory.
            with lzma.open(potential_output_filename, "w") as of:
                # Once we haven't thrown an error, do the computation
                filename = pdbconv.PdbRetreiver().retreive_pdb(
                    guid + str(age),
                    file_name=pdb_name,
                    progress_callback=progress_callback)
                if filename:
                    tmp_files.append(filename)
                    location = "file:" + request.pathname2url(tmp_files[-1])
                    json_output = pdbconv.PdbReader(
                        self.context, location, progress_callback).get_json()
                    of.write(
                        bytes(
                            json.dumps(json_output, indent=2, sort_keys=True),
                            'utf-8'))
                    # After we've successfully written it out, record the fact so we don't clear it out
                    data_written = True
                else:
                    vollog.warning(
                        "Symbol file could not be found on remote server" + (" " * 100))
            # First writable location handled — don't try the remaining paths.
            break
        except PermissionError:
            vollog.warning(
                "Cannot write necessary symbol file, please check permissions on {}"
                .format(potential_output_filename))
            continue
        finally:
            # If something else failed, removed the symbol file so we don't pick it up in the future
            if not data_written and os.path.exists(potential_output_filename):
                os.remove(potential_output_filename)

            # Clear out all the temporary file if we constructed one
            for filename in tmp_files:
                try:
                    os.remove(filename)
                except PermissionError:
                    vollog.warning(
                        "Temporary file could not be removed: {}".format(filename))
    else:
        # for/else: no directory accepted the write (loop never hit break).
        vollog.warning(
            "Cannot write downloaded symbols, please add the appropriate symbols"
            " or add/modify a symbols directory that is writable")
def resolve_all_links_and_redirects(encoding='utf-8'):
    """Read the wiki page/redirect/pagelink dumps and build a link map.

    Reads three lzma-compressed TSV files from ``config.which_wiki``:
    pages, redirects and pagelinks. Redirect chains are followed (up to 50
    hops, to guard against cycles) so every title maps to a final resolved
    page id. Returns ``link_map``: dict of source page id -> list of
    resolved target page ids.

    Args:
        encoding: text encoding used to decode the decompressed dump data.

    Returns:
        dict[int, list[int]]: mapping of linking page id to linked page ids.
    """
    page_file_name = os.path.join(config.which_wiki, 'pages.lzma')
    link_file_name = os.path.join(config.which_wiki, 'pagelinks.lzma')
    redirect_file_name = os.path.join(config.which_wiki, 'redirects.lzma')

    page_id_to_title = {}
    page_title_to_id = {}
    resolved_ids = set()
    resolved_title_to_id = {}  # the final result of this work

    # Pass 1: load the page dump. Columns observed here: 0 = page id,
    # 1 = title, 3 = redirect flag (0 means a real, resolved page).
    with lzma.open(page_file_name) as page_file:
        pages = page_file.read().decode(encoding, errors='replace').split('\n')
        count = 0
        for page in pages:
            tup = tuple(entry for entry in page.split('\t'))
            if len(tup) > 1:
                page_id_to_title[int(tup[0])] = tup[1]
                # gonna go from id to title
                # so that redirects go from their own title to the redirected id
                page_title_to_id[tup[1]] = int(tup[0])
                if len(tup) > 3:
                    if int(tup[3]) == 0:
                        resolved_ids.add(int(tup[0]))
                        resolved_title_to_id[tup[1]] = int(tup[0])
    print ("Page file read in. Length: {}".format(len(page_id_to_title)))
    print ("Number of resolved IDs: {}".format(len(resolved_ids)))

    num_redirect_failures = 0
    num_redirect_successes = 0

    # Pass 2: resolve redirects to their final page id.
    with lzma.open(redirect_file_name) as redirect_file:
        redirects = redirect_file.read().decode(encoding, errors='replace').split('\n')
        # count = 0
        for redirect in redirects:
            tup = tuple(entry for entry in redirect.split('\t'))
            # file is actually organized as 'redirect page id, final page title'
            # that has to be unraveled
            # to do that, use previously read in pages to get the page title
            # and map the page_title_to_id title to the redirected title to redirected id
            # problem! there can be multiple levels of redirects. gah.
            if len(tup) > 1:
                # print (tup)
                try:
                    original_page_title = page_id_to_title[int(tup[0])]
                except:
                    continue
                # print (original_page_title)
                # have to follow the rabbit hole down to find the vast majority of redirects
                # of redirects. That means looking until there's a hit in the title_to_id set
                try:
                    redirect_count = 0
                    current_title = tup[1]
                    current_id = int(page_title_to_id[current_title])
                    # 50-hop cap guards against redirect cycles.
                    while current_id not in resolved_ids and redirect_count < 50:  # atrociously large, in case there are cycles
                        current_title = page_id_to_title[current_id]
                        current_id = int(page_title_to_id[current_title])
                        redirect_count += 1
                    if original_page_title not in resolved_title_to_id:
                        resolved_title_to_id[original_page_title] = current_id
                    else:
                        if current_id != resolved_title_to_id[original_page_title]:
                            print ("Already resolved {} to {} not {}".format(tup[1], resolved_title_to_id[original_page_title], current_id))
                    num_redirect_successes += 1
                except Exception as e:
                    # Any lookup miss along the chain counts as a failure.
                    num_redirect_failures += 1
                    old_print_statements = '''
                    print ("Tup {} broke after {} redirect disentanglements with exception {}".format(tup, redirect_count, sys.exc_info()[0]))
                    print ("Current title {} and current id {}".format(tup[1], current_id))
                    try:
                        print("Trying to print the page title {}".format(page_id_to_title[int(tup[0])]))
                        print("Could try {} as the final id".format(page_title_to_id[tup[1]]))
                        print("That id produces article tuple {}".format(all_articles[page_title_to_id[tup[1]]]))
                        print("Trying to print final id {}".format(title_to_id[tup[1]]))
                    except:
                        pass
                    '''
    print ("Redirects complete. {} succeeded, {} failed.".format(num_redirect_successes, num_redirect_failures))
    print ("Total resolved titles: {} vs total titles {}.".format(len(resolved_title_to_id), len(page_title_to_id)))

    # first, we go through the links, and build up a set of titles that they are linked to
    # then, we go through the page set and find the ids for those titles.
    # if the seed set size + the linked size is contained in the target size, expand the
    # seed set and repeat
    link_map = {}
    # Smaller chunks under config.testing keep test runs fast.
    max_chunk_size = 1000000 if config.testing else 1000000000
    decompressor = lzma.LZMADecompressor()
    with open(link_file_name, 'rb') as link_file:
        data = link_file.read()

    # now, decompress a chunk at a time
    # if the line doesn't end in a carriage return, keep that last bit for the next line
    links = decompress_chunk(decompressor, data, encoding, max_chunk_size)
    count = 1
    link_line_failures = 0
    link_line_successes = 0
    while not decompressor.needs_input:
        print("reading decompressed lines {}".format(count))
        count += 1
        if config.testing and count > 2:
            break
        if not links:
            break
        for link_line in links:
            # do I need to worry about incomplete lines?
            tup = tuple(entry for entry in link_line.split('\t'))
            if len(tup) > 1:
                id_from_title = None
                try:
                    id_from_title = int(resolved_title_to_id[tup[1]])
                except:
                    if config.testing:
                        print ("tup {} from link_line {} broke somehow.".format(tup, link_line))
                        # print ("page id {}".format(page_title_to_id[tup[1]]))
                    link_line_failures += 1
                if id_from_title:
                    # two try/excepts in case id_from_title is busted
                    try:
                        link_map[int(tup[0])] += [id_from_title]
                    except:
                        link_map[int(tup[0])] = [id_from_title]
                    link_line_successes += 1
        # Feed the decompressor's leftover bytes back in for the next chunk.
        links = decompress_chunk(decompressor, decompressor.unused_data, encoding, max_chunk_size)
    print ("Link Files have been imported. Length of links: {}".format(len(link_map)))
    print ("Link line successes: {} failures: {}".format(link_line_successes, link_line_failures))
    return link_map