def plot():
    df = generate_data()

    plt.figure(figsize=(6, 3))
    sns.lineplot(data=df[["Lucloelium a", "Vorpal Pick +1 a"]], palette="gray")
    xticks = plt.xticks()[0]
    xticks_tr = [convert_size(k, "") if k >= 0 else "" for k in xticks]
    xticks_tr[-1] = ""
    plt.xticks(xticks, labels=xticks_tr)
    plt.savefig("./plots/coeffs.pdf", bbox_inches='tight', pad_inches=0)
    plt.cla()

    plt.figure(figsize=(6, 3))
    sns.lineplot(data=df[["Lucloelium", "Vorpal Pick +1"]], palette="gray")
    xticks_tr = [convert_size(k, "") if k >= 0 else "" for k in xticks]
    xticks_tr[-1] = ""
    yticks = plt.yticks()[0]
    yticks_tr = [convert_size_round(k, "") if k >= 0 else "" for k in yticks]
    yticks_tr[-1] = ""
    plt.yticks(yticks, labels=yticks_tr)
    plt.savefig("./plots/units.pdf", bbox_inches='tight', pad_inches=0)
    plt.cla()
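# The tick relabelling above depends on convert_size()/convert_size_round(), which are not part of
# this snippet. The sketch below is only an assumption about their shape: a helper that turns a
# numeric byte count into a human-readable string with an optional unit suffix (here called with ""
# so ticks read e.g. "1.5 K"). The rounding variant and the parameter names are illustrative, not
# the original implementation.
import math

def convert_size(size_bytes, suffix="B"):
    # Hypothetical helper: 1536 -> "1.5 KB" (or "1.5 K" when suffix="").
    if size_bytes <= 0:
        return "0" + ((" " + suffix) if suffix else "")
    units = ("", "K", "M", "G", "T", "P")
    i = max(0, min(int(math.log(size_bytes, 1024)), len(units) - 1))
    return "{:.1f} {}{}".format(size_bytes / (1024 ** i), units[i], suffix)

def convert_size_round(size_bytes, suffix="B"):
    # Hypothetical variant used for the y-ticks: same conversion, rounded to whole units.
    if size_bytes <= 0:
        return "0" + ((" " + suffix) if suffix else "")
    units = ("", "K", "M", "G", "T", "P")
    i = max(0, min(int(math.log(size_bytes, 1024)), len(units) - 1))
    return "{:d} {}{}".format(int(round(size_bytes / (1024 ** i))), units[i], suffix)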
def udp_server_single_node(lim='10MB'):
    UDP_IP = '127.0.0.1'
    UDP_PORT = 5005
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    sock.bind((UDP_IP, UDP_PORT))
    cnt = 0
    running_bytes = 0
    server_start = dt.datetime.now()
    while True:
        signal.signal(signal.SIGINT, signal.default_int_handler)
        try:
            data, addr = sock.recvfrom(65535)
            running_bytes += len(data)
            cnt += 1
            row_data = (
                (dt.datetime.now() - server_start),
                convert_size(running_bytes),
                cnt  # "{:,}".format(cnt)
            )
            print(row_print('UDP', 'single', 'single node', row_data))
            if data == b'END':
                dump_to_csv('server', 'UDP', 'localhost', 'single', 'single node', row_data)
                sys.exit()
        except KeyboardInterrupt:
            dump_to_csv('server', 'UDP', 'localhost', 'single', 'single node', row_data)
            sys.exit()
def udp_client_single_node(lim='10 MB', rounds=0):
    UDP_IP = '127.0.0.1'
    UDP_PORT = 5005
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    cnt = 0
    running_bytes = 0
    client_start = dt.datetime.now()
    with open('data/emails.csv') as fd:
        for line in fd:
            if running_bytes <= parseSize(lim):
                # print('%s - %s bytes' % (line, utf8len(line)))
                line_bytes = line.encode('utf-8')
                sock.sendto(line_bytes, (UDP_IP, UDP_PORT))
            else:
                sock.sendto('END'.encode('utf-8'), (UDP_IP, UDP_PORT))
                dump_to_csv('client', 'UDP', rounds, 'localhost', 'single', 'single node', row_data)
                break
            cnt += 1
            running_bytes += utf8len(line)
            row_data = (
                (dt.datetime.now() - client_start),
                convert_size(running_bytes),
                cnt  # "{:,}".format(cnt)
            )
            print(row_print('UDP', 'single', 'single node', row_data))
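# The UDP and ZeroMQ benchmark functions in this file share several helpers (utf8len, parseSize,
# row_print, dump_to_csv) whose definitions are not shown. The sketch below is an assumption made
# only to keep the snippets readable in isolation: the CSV schema, the results filename and the
# exact log format are invented for illustration, not taken from the original project.
import csv
import re

def utf8len(s):
    # Assumed helper: number of bytes the string occupies once UTF-8 encoded.
    return len(s.encode('utf-8'))

def parseSize(size_str):
    # Assumed helper: parse a human-readable size such as '10 MB' or '10MB' into bytes.
    units = {'B': 1, 'KB': 1024, 'MB': 1024 ** 2, 'GB': 1024 ** 3}
    match = re.match(r'^\s*([\d.]+)\s*([A-Za-z]+)\s*$', size_str)
    return int(float(match.group(1)) * units[match.group(2).upper()])

def row_print(protocol, mode, topology, row_data):
    # Assumed helper: format one progress line from the (elapsed, size, count) tuple.
    elapsed, size, count = row_data
    return '%s - %s - %s - %s - %s - %s messages' % (protocol, mode, topology, elapsed, size, count)

def dump_to_csv(role, protocol, *fields):
    # Assumed helper: append one result row; the callers pass a varying number of fields.
    with open('results.csv', 'a', newline='') as f:
        csv.writer(f).writerow([role, protocol] + [str(x) for x in fields])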
def zeromq_client(lim='10 MB', rounds=0):
    context = zmq.Context()
    publisher = context.socket(zmq.PUB)
    publisher.bind("tcp://*:5563")
    cnt = 0
    running_bytes = 0
    client_start = dt.datetime.now()
    with open('data/emails.csv') as fd:
        for line in fd:
            running_bytes += utf8len(line)
            cnt += 1
            row_data = (
                (dt.datetime.now() - client_start),
                convert_size(running_bytes),
                cnt  # "{:,}".format(cnt)
            )
            if running_bytes <= parseSize(lim):
                # print('%s - %s bytes' % (line, utf8len(line)))
                line_bytes = line.encode('utf-8')
                # envelope + content
                publisher.send_multipart([b'emails', line_bytes])
            else:
                publisher.send_multipart([b'emails', 'END'.encode('utf-8')])
                dump_to_csv('client', 'ZeroMQ', rounds, 'localhost', 'single', 'single node', row_data)
                # sys.exit()
                break
    print('ZeroMQ - single node - %s - %s - %s messages' % (
        (dt.datetime.now() - client_start),
        convert_size(running_bytes),
        "{:,}".format(cnt)
    ))
    publisher.close()
    context.term()
def __str__(self): output = "Content:\n" output += " ID Index Type Size Hash\n" output += " {:08X} {:<7d} {:<14s} {:<11s}".format( self.cid, self.index, self.get_type(), utils.convert_size(self.size) ) output += binascii.hexlify(self.sha256).decode() + "\n" return output
def __init__(self, item, thread_num, parent):
    threading.Thread.__init__(self, name=(thread_num + 1))
    self.nm = item["filename"]
    self.base64path = item["filepath"]
    self.size_bytes = item["filesize"]
    self.size_str = utils.convert_size(self.size_bytes)
    self.thread_num = thread_num
    self.fulldest = item["filedir"]
    self.prt = parent
    self.destpath = item["fullpath"]
    self.tmppath = ""
def display_all_image_info(self):
    col_fmt = "{0:40}{1:12}{2:14}{3:18}{4:14}"
    write_out(col_fmt.format("REPOSITORY", "TAG", "IMAGE ID", "CREATED", "SIZE"))
    for i in self.client.images.list():
        for r in i["repoTags"]:
            # Split on the last ':' so the tag is separated from the repository name.
            rsplit = r.rindex(":")
            name = r[0:rsplit]
            tag = r[rsplit + 1:]
            write_out(col_fmt.format(name, tag, i["id"][:12],
                                     stringTimeToHuman(i["created"]),
                                     convert_size(i["size"])))
def verbose(self):
    print('Name: {}'.format(self.name))
    # print('Artist: {}'.format(self.artist.name))
    print('Year: {}'.format(self.year))
    print('Number: {} / {}'.format(self.number, self.album.number_of_tracks))
    print('URL: {}'.format(self.url))
    print('ID: {}'.format(self.id))
    print('Size: {}'.format(convert_size(self.size)))
    print('Created: {}'.format(self.created_date))
    print('Last Update: {}'.format(self.updated_date))
    if self.live:
        status = 'Live'
    else:
        status = 'Missing'
    print('Status: {}'.format(status))
def zeromq_server(lim='10MB', rounds=0):
    context = zmq.Context()
    subscriber = context.socket(zmq.SUB)
    subscriber.connect("tcp://localhost:5563")
    subscriber.setsockopt(zmq.SUBSCRIBE, b"emails")
    cnt = 0
    running_bytes = 0
    server_start = dt.datetime.now()
    while True:
        # envelope + data
        [address, data] = subscriber.recv_multipart()
        running_bytes += len(data)
        cnt += 1
        row_data = (
            (dt.datetime.now() - server_start),
            convert_size(running_bytes),
            cnt  # "{:,}".format(cnt)
        )
        print(row_print('ZeroMQ', 'single', 'single node', row_data))
        # print('ZeroMQ - single node - %s - %s - %s messages' % (
        #     (dt.datetime.now() - client_start),
        #     convert_size(running_bytes),
        #     "{:,}".format(cnt)
        # ))
        if data == b'END':
            dump_to_csv('server', 'ZeroMQ', rounds, 'localhost', 'single', 'single node', row_data)
            # sys.exit()
            break
    subscriber.close()
    context.term()
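# Illustrative harness only (not part of the original code): with PUB/SUB the subscriber must be
# connected before the publisher starts sending, otherwise the first messages are silently dropped.
# The multiprocessing wrapper below is just one way to drive zeromq_server/zeromq_client locally.
import time
from multiprocessing import Process

if __name__ == '__main__':
    server = Process(target=zeromq_server, kwargs={'lim': '10MB', 'rounds': 0})
    server.start()
    time.sleep(1)  # give the SUB socket a moment to connect and subscribe
    zeromq_client(lim='10 MB', rounds=0)
    server.join()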
def main(titleid,
         titlever=None,
         spoofver=None,
         pack_as_cia=True,
         keepcontents=True,
         enc_titlekey=None,
         onlyticket=False,
         cdndir=False,
         base_url="http://nus.cdn.c.shop.nintendowifi.net/ccs/download"):
    if len(titleid) != 16:
        print("Title ID must be 16 characters long.")
        return
    try:
        int(titleid, 16)
    except ValueError:
        print("Title ID must be in hexadecimal.")
        return

    if onlyticket and not enc_titlekey:
        print("Please specify an encrypted titlekey (--key) for Ticket generation.")
        return

    if enc_titlekey:
        if len(enc_titlekey) != 32:
            print("Encrypted title key must be 32 characters long.")
            return
        try:
            int(enc_titlekey, 16)
        except ValueError:
            print("Title key must be in hexadecimal.")
            return

    if not pack_as_cia and not keepcontents:
        print("Running with these settings would produce no output.")
        return

    titleid = titleid.lower()
    nus = CIAGEN.NUS(titleid, titlever, base=base_url)

    if onlyticket:
        print("Generating Ticket for Title {0} v{1}".format(
            titleid, "[Latest]" if titlever == None else titlever))
    else:
        print("Downloading Title {0} v{1}".format(
            titleid, "[Latest]" if titlever == None else titlever))

    # Download TMD
    print("* Downloading TMD...")
    try:
        tmd = nus.tmd
    except HTTPError:
        print("Title not on NUS!")
        return

    # Parse TMD
    print("* Parsing TMD...")
    total_size = 0
    for content in tmd.contents:
        total_size += content.size
    print(" Title Version: {0}".format(tmd.hdr.titleversion))
    print(" {0} Content{1}: {2}".format(
        len(tmd.contents), "s" if len(tmd.contents) > 1 else "",
        utils.convert_size(total_size)))

    if titlever == None:
        titlever = tmd.hdr.titleversion
    else:
        if titlever != tmd.hdr.titleversion:
            print("WARNING: Title version should be {0} but is {1}".format(
                titlever, tmd.hdr.titleversion))
    if titleid != tmd.get_titleid():
        print("WARNING: Title ID should be {0} but is {1}".format(
            titleid, tmd.get_titleid()))

    if spoofver != None:
        tmd.hdr.titleversion = spoofver

    if cdndir:
        titlepath = os.path.join("titles", titleid)
    else:
        titlepath = os.path.join("titles", titleid, str(titlever))
    if not os.path.isdir(titlepath):
        os.makedirs(titlepath)

    if not onlyticket:
        if cdndir:
            tmd.dump(os.path.join(titlepath, "tmd.{0}".format(titlever)))
        else:
            tmd.dump(os.path.join(titlepath, "tmd"))

    # Download Ticket
    if enc_titlekey:
        print("* Generating Ticket...")
        cetk = CIAGEN.Ticket(tickettemplate + magic)
        cetk.hdr.titleid = tmd.hdr.titleid
        cetk.hdr.titleversion = tmd.hdr.titleversion
        cetk.hdr.titlekey = binascii.a2b_hex(enc_titlekey)
        cetk.dump(os.path.join(titlepath, "cetk"))
        if onlyticket:
            print("Finished.")
            return
    else:
        print("* Downloading Ticket...")
        cetk = nus.ticket
        if not cetk:
            if pack_as_cia:
                print(" Ticket unavailable, can't be packed.")
                pack_as_cia = False
            else:
                print(" Ticket unavailable.")
        else:
            cetk.dump(os.path.join(titlepath, "cetk"))

    # Download Contents
    print("* Downloading Contents...")
    for i, content_url in enumerate(nus.get_content_urls()):
        print(" Content #{0} of #{1}: {2} ({3})".format(
            i + 1, tmd.hdr.contentcount, tmd.contents[i].get_cid(),
            utils.convert_size(tmd.contents[i].size)))
        content_path = os.path.join(titlepath, tmd.contents[i].get_cid())
        req = get(content_url, stream=True)
        if req.status_code != 200:
            print(" Failed to download content: Is the title still on the NUS?")
            return
        with open(content_path, 'wb') as content_file:
            for chunk in req.iter_content(chunk_size=5242880):  # Read in 5 MB chunks
                if chunk:
                    content_file.write(chunk)
        if os.path.getsize(content_path) != tmd.contents[i].size:
            print(" Content size mismatch. Abort...")
            return

    # Pack as CIA
    if pack_as_cia:
        print("* Creating CIA...")
        cia_path = os.path.join(
            titlepath, "{0}-v{1}{2}.cia".format(
                titleid, titlever,
                "" if spoofver == None else ("-fakev" + str(spoofver))))
        if cdndir:
            CIAGEN.CIAMaker(titlepath, titlever=titlever).dump(cia_path)
        else:
            CIAGEN.CIAMaker(titlepath).dump(cia_path)
        if not os.path.isfile(cia_path):
            print(" CIA creation failed.")
        else:
            print(" CIA creation successful: {0}".format(cia_path))
    else:
        print("Finished.")

    if not keepcontents:
        if cdndir:
            os.remove(os.path.join(titlepath, "tmd.{0}".format(titlever)))
        else:
            os.remove(os.path.join(titlepath, "tmd"))
        try:
            os.remove(os.path.join(titlepath, "cetk"))
        except FileNotFoundError:
            pass
        for content in tmd.contents:
            os.remove(os.path.join(titlepath, content.get_cid()))
index_file = open(filename, "w") json.dump(index, index_file) index_file.close() if __name__ == "__main__": index = {} products = json.load(open("search_dataset.json")) index_filename = "index.json" index["products_by_id"] = utils.call_with_monitor_1( build_index_of_products_by_id, products) index["bows_by_id"] = utils.call_with_monitor_1( build_index_of_bag_of_words_by_id, index["products_by_id"]) index["ids_by_brand"] = utils.call_with_monitor_1( build_index_of_ids_by_brand, index["products_by_id"]) index["ids_by_term"] = utils.call_with_monitor_1( build_index_of_ids_by_term, index["bows_by_id"]) index["ids_by_collocation"] = utils.call_with_monitor_2( build_index_of_ids_by_collocation, index["ids_by_term"], index["bows_by_id"]) utils.call_with_monitor_2(dump_index, index, index_filename) print("{:40}".format(index_filename), end='', flush=True) print(utils.convert_size(os.stat(index_filename).st_size))
def __str__(self): output = "TMD:\n" output += " Title ID: {0}\n".format(self.get_titleid()) output += " Title Version: {0}\n".format(self.hdr.titleversion) output += "\n" output += " Number of contents: {0}\n".format(self.hdr.contentcount) output += " Contents:\n" output += " ID Index Type Size Hash\n" for content in self.contents: output += " {:08X} {:<7d} {:<14s} {:<11s}".format( content.cid, content.index, content.get_type(), utils.convert_size(content.size) ) output += binascii.hexlify(content.sha256).decode() + "\n" # TODO: Improve this, is a bit complicated to understand and duplicated if self.certificates: output += "\n" output += " Certificates:\n" try: signs_tmd = self.get_cert_by_name(self.get_issuer()[-1]) # CP signs_cp = self.get_cert_by_name(self.get_issuer()[1]) # CA except ValueError: output += " Could not locate the needed certificates.\n" return output try: signs_ca = self.get_cert_by_name(self.get_issuer()[0]) # Root except ValueError: signs_ca = None # Check TMD signature verified = utils.Crypto.verify_signature( self.certificates[signs_tmd], self.signature_pack(), self.signature ) sha256hash = utils.Crypto.create_sha256hash_hex(self.signature_pack()) output += " TMD signed by {0} using {1}:\n {2} ".format( "-".join(self.get_issuer()), self.certificates[signs_tmd].get_key_type(), sha256hash ) if verified: output += "[OK]" else: output += "[FAIL]" output += "\n" # Check CP signature verified = utils.Crypto.verify_signature( self.certificates[signs_cp], self.certificates[signs_tmd].signature_pack(), self.certificates[signs_tmd].signature ) sha256hash = utils.Crypto.create_sha256hash_hex(self.certificates[signs_tmd].signature_pack()) output += " {0} ({1}) signed by {2} ({3}):\n {4} ".format( self.certificates[signs_tmd].get_name(), self.certificates[signs_tmd].get_key_type(), self.certificates[signs_tmd].get_issuer(), self.certificates[signs_cp].get_key_type(), sha256hash ) if verified: output += "[OK]" else: output += "[FAIL]" output += "\n" # Check Root signature if signs_ca: verified = utils.Crypto.verify_signature( signs_ca, self.certificates[signs_cp].signature_pack(), self.certificates[signs_cp].signature ) sha256hash = utils.Crypto.create_sha256hash_hex(self.certificates[signs_cp].signature_pack()) output += " {0} ({1}) signed by {2} ({3}):\n {4} ".format( self.certificates[signs_cp].get_name(), self.certificates[signs_cp].get_key_type(), self.certificates[signs_cp].get_issuer(), ROOT_KEY.get_key_type(), sha256hash ) if verified: output += "[OK]" else: output += "[FAIL]" else: output += " {0} ({1}) signed by {2}: Please place root-key in the same directory".format( self.certificates[signs_cp].get_name(), self.certificates[signs_cp].get_key_type(), self.certificates[signs_cp].get_issuer() ) output += "\n" return output
def run(self):
    sizecopied = 0
    if self.prt.tmp:
        fulltmp = self.prt.tmp
    else:
        fulltmp = self.fulldest
    params = {"access_token": self.prt.accesstoken, "path": self.base64path}
    log.info("%s size %s", self.nm, self.size_str)
    try:
        apidownloaduri = "%s%s?%s" % (BASE_URL, urllib.quote_plus(self.nm), urllib.urlencode(params))
    except KeyError:
        self.cleanUpAfterError("Error unsupported characters in filename %s" % self.nm, self.destpath)
    except:  # This technically should never happen, but you never know
        self.cleanUpAfterError(traceback.format_exc(), self.destpath)
    if not os.path.exists(self.fulldest) or (self.prt.tmp and not os.path.exists(fulltmp)):
        self.cleanUpAfterError("Missing temp or destination parent directory", self.destpath)
    self.tmppath = os.path.join(fulltmp, self.nm)
    if self.prt.tmp:
        filehash = sha1("blob " + str(self.size_bytes) + "\0" + self.tmppath)
        tmpname = filehash.hexdigest()
        self.tmppath = os.path.join(self.prt.tmp, tmpname)
    log.debug("Downloading file to %s", self.tmppath)
    retriesleft = 3
    sizemismatched = False
    failsize = False
    while retriesleft > 0:
        sizecopied = 0
        progress = time.time() + 60
        apiratecount = 0
        try:
            with open(self.tmppath, 'wb') as tmpfile:
                st = time.time()
                timespan = 0
                req = requests.get(apidownloaduri, stream=True, timeout=120)
                chunk_size = 1024
                # if not sizemismatched:
                chunk_size += 1024 * 1024
                for chunk in req.iter_content(chunk_size=chunk_size):
                    sizecopied += len(chunk)
                    if self.prt.end:
                        break
                    if chunk:  # filter out keep-alive new chunks
                        tmpfile.write(chunk)
                        if self.prt.progress and progress < time.time():
                            progress = time.time() + 60
                            speed = utils.get_speed(sizecopied, (time.time() - st))
                            log.info("%s\nDownloaded %s of %s at %s", self.nm,
                                     utils.convert_size(sizecopied), self.size_str, speed)
                        # if sizemismatched:
                        #     tmpfile.flush()
                        #     os.fsync(tmpfile.fileno())
                timespan = (time.time() - st)
            if sizecopied != self.size_bytes and not self.prt.end:
                raise SizeMismatchError("Download size mismatch downloaded %s expected %s"
                                        % (sizecopied, self.size_bytes))
            elif self.prt.end:
                self.cleanUpAfterError("Received stop signal during download", self.destpath)
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError,
                requests.exceptions.RequestException):
            retriesleft -= 1
            if req.status_code == 429:
                apiratecount += 1
                retriesleft += 1
                log.warn("API rate limit reached. Will retry")
            else:
                log.warn("Network error. Will retry %s more times", retriesleft)
            if retriesleft > 0:
                time.sleep(10 * apiratecount)
            else:
                log.error("Error downloading %s", self.nm)
                self.cleanUpAfterError("Maximum retries reached", self.destpath)
        except SizeMismatchError:
            retriesleft -= 1
            log.exception("%s File size mismatch. Will retry %s more times", self.nm, retriesleft)
            sizemismatched = True
            if retriesleft == 2:
                failsize = sizecopied
            elif failsize and failsize != sizecopied:
                failsize = False
            if retriesleft > 0:
                time.sleep(10)
            elif failsize:
                log.warn("%s\nReceived incorrect file size %s instead of %s 3 times. Saving anyway",
                         self.nm, sizecopied, self.size_bytes)
            else:
                log.error("Error downloading %s", self.nm)
                self.cleanUpAfterError("Maximum retries reached", self.destpath)
        except (IOError, OSError, WindowsError):
            log.exception("Error writing to file %s", self.nm)
            self.cleanUpAfterError(traceback.format_exc(), self.destpath)
        except (requests.exceptions.RequestException, Exception):
            log.error("Error downloading %s", self.nm)
            self.cleanUpAfterError(traceback.format_exc(), self.destpath)
        except SystemExit:
            self.cleanUpAfterError("Received signal exit", self.destpath)
            raise
        except:
            if req.status_code in [429, 503]:
                apiratecount += 1
                retriesleft += 1
                log.warn("API rate limit reached. Will retry")
            else:
                log.exception("Error downloading %s. Will retry %s more times", self.nm, retriesleft)
            if retriesleft > 0:
                time.sleep(10 * apiratecount)
            else:
                self.cleanUpAfterError("An unknown error occurred", self.destpath)
        else:
            retriesleft = 0
            self.prt.downloadtime += timespan
            if self.prt.progress:
                speed = utils.get_speed(self.size_bytes, timespan)
                log.info("%s downloaded at %s", self.size_str, speed)
    if self.prt.end:
        log.warn("Parent signaled stop")
        return
    self.prt.bytestotal += self.size_bytes
    if self.prt.tmp:
        log.info("Copying from temp to dest")
        retriesleft = 3
        while retriesleft > 0:
            try:
                st = time.time()
                timespan = 0
                with open(self.tmppath, 'rb') as f, open(self.destpath, "wb") as fo:
                    while True and not self.prt.end:
                        piece = f.read(1024)
                        if piece:
                            fo.write(piece)
                        else:
                            break
                timespan = (time.time() - st)
                if self.prt.end:
                    self.cleanUpAfterError("Received stop signal during copy", self.destpath)
            except (IOError, OSError, WindowsError) as e:
                retriesleft -= 1
                if retriesleft > 0:
                    self.delete_dest()
                    log.exception("Error copying file, will retry %s more times", retriesleft)
                else:
                    log.exception("Error file could not be copied to %s", self.destpath)
                    self.cleanUpAfterError(traceback.format_exc(), self.destpath)
            except SystemExit:
                self.cleanUpAfterError("Received signal exit", self.destpath)
                raise
            except:  # This technically should never happen, but you never know
                retriesleft -= 1
                if retriesleft > 0:
                    self.delete_dest()
                    log.exception("Error copying file, will retry %s more times", retriesleft)
                else:
                    log.exception("Error file could not be copied to %s", self.destpath)
                    self.cleanUpAfterError(traceback.format_exc(), self.destpath)
            else:
                retriesleft = 0
                self.prt.copytime += timespan
                if self.prt.progress:
                    speed = utils.get_speed(self.size_bytes, timespan)
                    log.info("%s copied at %s", self.size_str, speed)
        try:
            os.remove(self.tmppath)
        except (IOError, OSError) as e:
            log.warn("Failed cleaning up tmp file %s\n%s", self.tmppath, e)
    self.prt.writeSuccess(self.destpath)
    log.info("Finished download %s in ", self.destpath)
    log.debug("End of thread")
    self.prt.threads[self.thread_num] = None
def main(titleid,
         titlever=None,
         pack_as_wad=True,
         decryptcontents=False,
         localuse=True,
         keepcontents=True,
         enc_titlekey=None,
         onlyticket=False,
         cdndir=False,
         base_url="http://nus.cdn.shop.wii.com/ccs/download"):
    if len(titleid) != 16:
        print("Title ID must be 16 characters long.")
        return
    try:
        int(titleid, 16)
    except ValueError:
        print("Title ID must be in hexadecimal.")
        return

    if onlyticket and not enc_titlekey:
        print("Please specify an encrypted titlekey (--key) for Ticket generation.")
        return

    if enc_titlekey:
        if len(enc_titlekey) != 32:
            print("Encrypted title key must be 32 characters long.")
            return
        try:
            int(enc_titlekey, 16)
        except ValueError:
            print("Title key must be in hexadecimal.")
            return

    if not pack_as_wad and not keepcontents and not decryptcontents:
        print("Running with these settings would produce no output.")
        return

    titleid = titleid.lower()
    nus = WADGEN.NUS(titleid, titlever, base=base_url)

    if onlyticket:
        print("Generating Ticket for Title {0} v{1}".format(
            titleid, "[Latest]" if titlever == None else titlever))
    else:
        print("Downloading Title {0} v{1}".format(
            titleid, "[Latest]" if titlever == None else titlever))

    # Download TMD
    print("* Downloading TMD...")
    try:
        tmd = nus.tmd
    except HTTPError:
        print("Title not on NUS!")
        return

    # Parse TMD
    print("* Parsing TMD...")
    total_size = 0
    for content in tmd.contents:
        total_size += content.size
    print(" Title Version: {0}".format(tmd.hdr.titleversion))
    print(" {0} Content{1}: {2}".format(
        len(tmd.contents), "s" if len(tmd.contents) > 1 else "",
        utils.convert_size(total_size)))

    if titlever == None:
        titlever = tmd.hdr.titleversion
    else:
        if titlever != tmd.hdr.titleversion:
            print(" WARNING: Title version should be {0} but is {1}".format(
                titlever, tmd.hdr.titleversion))
    if titleid != tmd.get_titleid():
        print(" WARNING: Title ID should be {0} but is {1} (ignore for vWii)".format(
            titleid, tmd.get_titleid()))

    if cdndir:
        titlepath = os.path.join("titles", titleid)
    else:
        titlepath = os.path.join("titles", titleid, str(titlever))
    if not os.path.isdir(titlepath):
        os.makedirs(titlepath)

    if not onlyticket:
        if cdndir:
            tmd.dump(os.path.join(titlepath, "tmd.{0}".format(titlever)))
        else:
            tmd.dump(os.path.join(titlepath, "tmd"))

    # Download Ticket
    if enc_titlekey:
        # TODO: Generate DSi tickets
        print("* Generating Ticket...")
        cetk = WADGEN.Ticket(tickettemplate)
        cetk.hdr.titleid = tmd.hdr.titleid
        cetk.hdr.titleversion = tmd.hdr.titleversion
        cetk.hdr.titlekey = binascii.a2b_hex(enc_titlekey)
        if tmd.get_region() == "Korea" and not tmd.get_titleid().startswith("00030"):
            # Korea + not DSi
            cetk.hdr.ckeyindex = 1  # Korean common-key index
        elif titleid.startswith("00000007") or titleid.startswith("0007"):
            # Wii U Wii Mode
            cetk.hdr.ckeyindex = 2  # vWii common-key index
        cetk.dump(os.path.join(titlepath, "cetk"))
        if localuse:
            # We need to set Title IV and decrypt the titlekey for verifying
            cetk.titleiv = struct.pack(">Q", cetk.hdr.titleid) + b"\x00" * 8
            cetk.decrypted_titlekey = utils.Crypto.decrypt_titlekey(
                commonkey=cetk.get_decryption_key(),
                iv=cetk.titleiv,
                titlekey=cetk.hdr.titlekey)
        if onlyticket:
            print("Finished.")
            return
    else:
        print("* Downloading Ticket...")
        cetk = nus.ticket
        if not cetk:
            if pack_as_wad:
                print(" Ticket unavailable, can't pack nor verify.")
                pack_as_wad = False
            else:
                print(" Ticket unavailable, can't verify download.")
        else:
            cetk.dump(os.path.join(titlepath, "cetk"))

    if decryptcontents and not keepcontents and not cetk:
        print("Aborted, because contents should be deleted and decrypting is not possible.")
        return

    # Download Contents
    print("* Downloading Contents...")
    for i, content_url in enumerate(nus.get_content_urls()):
        print(" Content #{0} of #{1}: {2} ({3})".format(
            i + 1, tmd.hdr.contentcount, tmd.contents[i].get_cid(),
            utils.convert_size(tmd.contents[i].size)))
        content_path = os.path.join(titlepath, tmd.contents[i].get_cid())

        # Local Use
        if localuse and cetk:
            if os.path.isfile(content_path):
                with open(content_path, "rb") as content_file:
                    valid, decdata = utils.Crypto.check_content_hash(
                        tmd.contents[i],
                        cetk,
                        content_file.read(),
                        return_decdata=True)
                if valid:
                    print(" Content exists and has been verified!")
                    if decryptcontents:
                        print(" Decrypting...")
                        with open(content_path + ".app", "wb") as decrypted_content_file:
                            decrypted_content_file.write(decdata)
                    continue  # Go on with the next content
                else:
                    print(" Content exists, but hash check failed - redownloading...")

        # Start content download by sending GET request (in stream mode)
        content_data = b''
        with http_session.get(content_url, timeout=30, stream=True) as req:
            # Check status code and content length element
            content_total_length = req.headers.get('content-length')
            if req.status_code != 200 or content_total_length is None:
                print(" Failed to download content")
                return

            # Download data in chunks
            progress = 0
            content_total_length = int(content_total_length)
            content_total_length_str = utils.convert_size(content_total_length)
            for chunk in req.iter_content(chunk_size=(1024 * 256)):
                if not chunk:
                    break
                content_data += chunk
                progress += len(chunk)
                percentage = int(progress * 100 / content_total_length)
                bar_length = int(percentage / 2)
                sys.stdout.write("\r%s" % (' ' * 100))
                sys.stdout.flush()
                if progress < content_total_length:
                    sys.stdout.write(
                        "\r %u%% [%s%s] %s / %s" %
                        (percentage, '=' * bar_length, ' ' * (50 - bar_length),
                         utils.convert_size(progress), content_total_length_str))
                else:
                    sys.stdout.write("\r 100%% [%s] %s / %s" %
                                     ('=' * 50, content_total_length_str,
                                      content_total_length_str))
                sys.stdout.flush()
            sys.stdout.write("\r%s\r" % (' ' * 100))
            sys.stdout.flush()

        # Verify after download
        if cetk:
            valid, decdata = utils.Crypto.check_content_hash(
                tmd.contents[i], cetk, content_data, return_decdata=True)
            if not valid:
                print(" Hash check failed.")
                return
            if decryptcontents:
                print(" Decrypting...")
                with open(content_path + ".app", "wb") as decrypted_content_file:
                    decrypted_content_file.write(decdata)

        with open(content_path, 'wb') as content_file:
            content_file.write(content_data)

    # Pack as WAD
    if pack_as_wad:
        if not cetk.get_titleid().startswith("00030"):
            print("* Creating WAD...")
            wad_path = os.path.join(titlepath,
                                    "{0}-v{1}.wad".format(titleid, titlever))
            if cdndir:
                WADGEN.WADMaker(titlepath, titlever=titlever).dump(wad_path)
            else:
                WADGEN.WADMaker(titlepath).dump(wad_path)
            if not os.path.isfile(wad_path):
                print(" WAD creation failed.")
            else:
                print(" WAD creation successful: {0}".format(wad_path))
    else:
        print("Finished.")

    if not keepcontents:
        if cdndir:
            os.remove(os.path.join(titlepath, "tmd.{0}".format(titlever)))
        else:
            os.remove(os.path.join(titlepath, "tmd"))
        try:
            os.remove(os.path.join(titlepath, "cetk"))
        except FileNotFoundError:
            pass
        for content in tmd.contents:
            os.remove(os.path.join(titlepath, content.get_cid()))
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch Variational Autoencoders for Collaborative Filtering')
    parser.add_argument('--json_file',
                        type=str,
                        default='C:\\Users\iyeshuru\Downloads\dblp_papers_v11.txt',
                        help='Processed input h5 file.')
    parser.add_argument('--limit',
                        type=int,
                        default=None,
                        help='Limit number of data to process.')
    parser.add_argument('--release',
                        action='store_true',
                        help='Build only training set (no validation/test)')
    args = parser.parse_args()

    # json_file = 'C:\\Users\iyeshuru\PycharmProjects\PapersProject\\flow\dblp.cut'
    # json_file = 'C:\\Users\iyeshuru\PycharmProjects\PapersProject\\flow\dblp.large.cut'
    # json_file = 'C:\\Users\iyeshuru\Downloads\dblp_papers_v11.txt'
    json_file = 'C:\\Users\iyeshuru\PycharmProjects\PapersProject\\flow\dblp_test.txt'
    # json_file = args.json_file
    args.release = True
    args.limit = 1000

    DATA_DIR = 'data/'
    # index2paper
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)
    raw_output = os.path.join(DATA_DIR, 'raw_output.h5')
    processed_output_file = os.path.join(DATA_DIR, 'processed_output.h5')
    embeddings_output_file = os.path.join(DATA_DIR, 'embeddings_output.h5')
    author2idx_pickle = os.path.join(DATA_DIR, 'author2idx.pickle')
    idx2author_pickle = os.path.join(DATA_DIR, 'idx2author.pickle')
    paper2embedding_idx_pickle = os.path.join(DATA_DIR, 'paper2embedding_idx.pickle')

    # Process data
    create_paper_author_score_triples(json_file, raw_output, args.limit)

    paper2idxs = {}
    # Create paper id to indexes (group)
    warnings.warn("Loading all paper IDs into memory.")
    with h5py.File(raw_output, 'r') as f:
        for i, (paper, _, _) in tqdm(enumerate(f['paper_author_score']),
                                     total=len(f['paper_author_score']),
                                     desc='Building paper2idxs mapping'):
            if paper not in paper2idxs:
                paper2idxs[paper] = []
            paper2idxs[paper].append(i)

    unique_papers_count = len(paper2idxs.keys())
    n_heldout_users = int(unique_papers_count * 0.2)

    # Split Train/Validation/Test User Indices
    if args.release:
        print("Release: building only train set.")
        tr_papers_index_range = [0, unique_papers_count]
        vd_papers_index_range = [0, 0]
        te_papers_index_range = [0, 0]
    else:
        tr_papers_index_range = [0, unique_papers_count - n_heldout_users * 2]
        vd_papers_index_range = [
            unique_papers_count - n_heldout_users * 2,
            unique_papers_count - n_heldout_users
        ]
        te_papers_index_range = [
            unique_papers_count - n_heldout_users, unique_papers_count
        ]

    for dataset, index_range in zip(["Train", "Validation", "Test"], [
            tr_papers_index_range,
            vd_papers_index_range,
            te_papers_index_range,
    ]):
        print("{} papers: {}".format(dataset, index_range[1] - index_range[0]))

    warnings.warn("Loading all paper indexes into memory.")
    ranges = [tr_papers_index_range, vd_papers_index_range, te_papers_index_range]
    tr_indexes = [
        item for sublist in list(paper2idxs.values())
        [tr_papers_index_range[0]:tr_papers_index_range[1]] for item in sublist
    ]
    unique_train_authors = unique(raw_output, 1, tr_indexes)
    unique_train_authors_count = len(unique_train_authors)
    author2idx = dict((pid, i) for (i, pid) in enumerate(unique_train_authors))

    with h5py.File(raw_output, 'r') as raw_f, \
            h5py.File(processed_output_file, 'w') as processed_f:
        ################## Add Paper, author, scores data ##############################
        paper_author_score_ds = raw_f['paper_author_score']

        # Save mapping to ds
        with open(author2idx_pickle, 'wb') as handle:
            pickle.dump(author2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print("Saved {} (size: {})".format(
            author2idx_pickle, convert_size(os.path.getsize(author2idx_pickle))))

        # Save reverse mapping to ds (used in inference)
        idx2author = {}
        for author, idx in tqdm(author2idx.items(),
                                total=len(author2idx),
                                desc='Saving reverse mapping'):
            idx2author[idx] = author
        with open(idx2author_pickle, 'wb') as handle:
            pickle.dump(idx2author, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print("Saved {} (size: {})".format(
            idx2author_pickle, convert_size(os.path.getsize(idx2author_pickle))))
        del idx2author
        gc.collect()

        train_grp = processed_f.create_group('train')
        validation_grp = processed_f.create_group('validation')
        test_grp = processed_f.create_group('test')

        # Creating datasets and buffers for all train/validation/test -> train/test combinations
        list_pairs = list(
            map(
                lambda grp: [
                    DatasetBuffer(
                        grp.create_dataset(
                            'train',
                            maxshape=paper_author_score_ds.shape,
                            shape=(0, paper_author_score_ds.shape[1]),
                            chunks=paper_author_score_ds.chunks,
                            dtype=paper_author_score_ds.dtype)),
                    DatasetBuffer(
                        grp.create_dataset(
                            'test',
                            maxshape=paper_author_score_ds.shape,
                            shape=(0, paper_author_score_ds.shape[1]),
                            chunks=paper_author_score_ds.chunks,
                            dtype=paper_author_score_ds.dtype))
                ], [train_grp, validation_grp, test_grp]))
        buffers = [item for sublist in list_pairs for item in sublist]
        # tr_tr, tr_te, val_tr, val_te, test_te, test_tr = buffers

        # Filter by author, group by paper and split by proportion.
        for i, ((tr_tr, tr_te), index_range) in enumerate(zip(list_pairs, ranges)):
            print("Building dataset: {}".format(['Train', 'Validation', 'Test'][i]))
            # indexes = [item for sublist in list(paper2idxs.items())[index_range[0]:index_range[1]] for item in sublist]
            add_data(paper_author_score_ds,
                     tr_tr,
                     tr_te,
                     index_range,
                     paper2idxs,
                     author2idx,
                     test_proportion=None if i == 0 else 0.2)

        for buffer in buffers:
            buffer.close()

    ########## Add title embeddings #######################
    # TODO: Pass last two args differently...
    from get_embeddings import collect_embeddings
    # collect_embeddings(json_file, embeddings_output_file, args.limit, CHUNK_SIZE, skip_paper)
    collect_embeddings(paper2embedding_idx_pickle, json_file, unique_papers_count,
                       embeddings_output_file, args.limit, CHUNK_SIZE, skip_paper)
    print("Done!")