def plot():
    df = generate_data()

    plt.figure(figsize=(6, 3))
    sns.lineplot(data=df[["Lucloelium a", "Vorpal Pick +1 a"]], palette="gray")
    xticks = plt.xticks()[0]

    xticks_tr = [convert_size(k, "") if k >= 0 else "" for k in xticks]
    xticks_tr[-1] = ""

    plt.xticks(xticks, labels=xticks_tr)
    plt.savefig("./plots/coeffs.pdf", bbox_inches='tight', pad_inches=0)
    plt.cla()

    plt.figure(figsize=(6, 3))

    sns.lineplot(data=df[["Lucloelium", "Vorpal Pick +1"]], palette="gray")

    # Refetch the ticks for this new figure and apply the byte-size labels, mirroring the first plot.
    xticks = plt.xticks()[0]
    xticks_tr = [convert_size(k, "") if k >= 0 else "" for k in xticks]
    xticks_tr[-1] = ""
    plt.xticks(xticks, labels=xticks_tr)

    yticks = plt.yticks()[0]
    yticks_tr = [convert_size_round(k, "") if k >= 0 else "" for k in yticks]
    yticks_tr[-1] = ""

    plt.yticks(yticks, labels=yticks_tr)

    plt.savefig("./plots/units.pdf", bbox_inches='tight', pad_inches=0)
    plt.cla()
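Every example on this page ultimately calls a convert_size helper that renders a byte count as a human-readable string (the plotting snippet above also uses a rounding variant and a two-argument form). The projects' real implementations are not shown here; the following is only a minimal sketch, assuming the usual log-based approach and treating the second argument as an optional suffix:

import math

def convert_size(size_bytes, suffix=""):
    # Sketch only: render e.g. 1536 as "1.5 KB"; actual projects may differ in
    # signature, rounding, and unit names.
    if size_bytes <= 0:
        return "0 B" + suffix
    units = ("B", "KB", "MB", "GB", "TB", "PB")
    i = min(int(math.log(size_bytes, 1024)), len(units) - 1)
    return "{} {}{}".format(round(size_bytes / 1024 ** i, 2), units[i], suffix)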
Example No. 2
def udp_server_single_node(lim='10MB'):
	UDP_IP = '127.0.0.1'
	UDP_PORT = 5005

	sock = socket.socket(
		socket.AF_INET,
		socket.SOCK_DGRAM
	)
	sock.bind((UDP_IP, UDP_PORT))

	cnt = 0
	running_bytes = 0
	row_data = None  # guards the KeyboardInterrupt handler before the first datagram arrives
	server_start = dt.datetime.now()

	# Register once, outside the loop, so Ctrl-C raises KeyboardInterrupt.
	signal.signal(signal.SIGINT, signal.default_int_handler)

	while True:
		try:
			data, addr = sock.recvfrom(65535)
			running_bytes += len(data)
			cnt += 1

			row_data = (
				(dt.datetime.now() - server_start),
				convert_size(running_bytes),
				cnt # "{:,}".format(cnt)
			)
			print(row_print('UDP', 'single', 'single node',row_data))

			if data == b'END':
				dump_to_csv('server', 'UDP', 'localhost', 'single', 'single node', row_data)
				sys.exit()

		except KeyboardInterrupt:
			if row_data is not None:
				dump_to_csv('server', 'UDP', 'localhost', 'single', 'single node', row_data)
			sys.exit()
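The UDP benchmark above (and the other messaging snippets below) also relies on row_print and dump_to_csv helpers that are not included on this page, and whose signatures clearly vary between call sites (dump_to_csv is sometimes called with an extra rounds argument). A purely hypothetical sketch, just to make the calls readable:

import csv

def row_print(protocol, mode, topology, row_data):
    # Hypothetical: format one progress row (elapsed time, data volume, message count).
    elapsed, size_str, count = row_data
    return '%s - %s - %s - %s - %s - %s messages' % (protocol, mode, topology, elapsed, size_str, count)

def dump_to_csv(role, protocol, *fields):
    # Hypothetical: append one result row to a CSV log; the trailing fields mirror
    # whatever the caller passes (rounds, host, mode, topology, row_data, ...).
    with open('results.csv', 'a', newline='') as fd:
        csv.writer(fd).writerow([role, protocol] + list(fields))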
Example No. 3
def udp_client_single_node(lim='10 MB', rounds=0):
	UDP_IP = '127.0.0.1'
	UDP_PORT = 5005

	sock = socket.socket(
		socket.AF_INET,
		socket.SOCK_DGRAM
	)

	cnt = 0
	running_bytes = 0
	client_start = dt.datetime.now()
	
	with open('data/emails.csv') as fd:
		for line in fd:
			if running_bytes <= parseSize(lim):
				# print('%s - %s bytes' % (line, utf8len(line)))
				line_bytes = line.encode('utf-8')
				sock.sendto(line_bytes, (UDP_IP, UDP_PORT))
			else:
				sock.sendto('END'.encode('utf-8'), (UDP_IP, UDP_PORT))

				dump_to_csv('client', 'UDP', rounds, 'localhost', 'single', 'single node', row_data)
				break

			cnt += 1
			running_bytes += utf8len(line)

			row_data = (
				(dt.datetime.now() - client_start),
				convert_size(running_bytes),
				cnt # "{:,}".format(cnt)
			)
			print(row_print('UDP', 'single', 'single node',row_data))
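utf8len and parseSize are likewise external helpers: presumably utf8len returns the UTF-8 encoded byte length of a line, and parseSize turns a limit such as '10 MB' into a byte count. A hypothetical sketch under those assumptions:

def utf8len(s):
    # Byte length of a string once UTF-8 encoded.
    return len(s.encode('utf-8'))

def parseSize(size_str):
    # Hypothetical parser for limits like '10MB' or '10 MB'.
    units = {'KB': 1024, 'MB': 1024 ** 2, 'GB': 1024 ** 3, 'B': 1}
    s = size_str.strip().upper()
    for unit in ('KB', 'MB', 'GB', 'B'):
        if s.endswith(unit):
            return int(float(s[:-len(unit)].strip()) * units[unit])
    return int(s)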
Example No. 4
def zeromq_client(lim='10 MB', rounds = 0):
	context   = zmq.Context()
	publisher = context.socket(zmq.PUB)
	publisher.bind("tcp://*:5563")


	cnt = 0
	running_bytes = 0
	client_start = dt.datetime.now()
	
	with open('data/emails.csv') as fd:
		for line in fd:
			running_bytes += utf8len(line)
			cnt += 1

			row_data = (
				(dt.datetime.now() - client_start),
				convert_size(running_bytes),
				cnt # "{:,}".format(cnt)
			)

			if running_bytes <= parseSize(lim):
				# print('%s - %s bytes' % (line, utf8len(line)))
				line_bytes = line.encode('utf-8')
				
				# envelope + content
				publisher.send_multipart([b'emails', line_bytes])
			else:
				publisher.send_multipart([b'emails', 'END'.encode('utf-8')])
				dump_to_csv('client', 'ZeroMQ', rounds, 'localhost', 'single', 'single node', row_data)
				# sys.exit()
				break

			print('ZeroMQ - single node - %s - %s - %s messages' % (
				(dt.datetime.now() - client_start),
				convert_size(running_bytes),
				"{:,}".format(cnt)
			))

	publisher.close()
	context.term()
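A general caveat with this PUB/SUB setup (not something the snippet above handles): a publisher that binds and immediately starts sending usually loses the first messages, because the subscriber's connection and subscription take a moment to propagate (ZeroMQ's well-known "slow joiner" problem). A common, if crude, mitigation is to pause briefly after binding before the first send; this is only a sketch, not something taken from the original benchmark:

import time
import zmq

context = zmq.Context()
publisher = context.socket(zmq.PUB)
publisher.bind("tcp://*:5563")
time.sleep(1)  # crude: give subscribers time to connect before the first send_multipart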
Example No. 5
        def __str__(self):
            output = "Content:\n"
            output += "   ID         Index   Type           Size       Hash\n"
            output += "   {:08X}   {:<7d} {:<14s} {:<11s}".format(
                self.cid,
                self.index,
                self.get_type(),
                utils.convert_size(self.size)
            )
            output += binascii.hexlify(self.sha256).decode() + "\n"

            return output
    def __init__(self, item, thread_num, parent):
        threading.Thread.__init__(self, name=(thread_num+1))
        self.nm = item["filename"]
        self.base64path = item["filepath"]
        self.size_bytes = item["filesize"]
        self.size_str = utils.convert_size(self.size_bytes)
        self.thread_num = thread_num
        self.fulldest = item["filedir"]

        self.prt = parent
        self.destpath = item["fullpath"]
        self.tmppath = ""
Example No. 7
 def display_all_image_info(self):
     col_fmt = "{0:40}{1:12}{2:14}{3:18}{4:14}"
     write_out(
         col_fmt.format("REPOSITORY", "TAG", "IMAGE ID", "CREATED", "SIZE"))
     for i in self.client.images.list():
         for r in i["repoTags"]:
             rsplit = r.rindex(":")
             name = r[0:rsplit]
             tag = r[rsplit + 1:]
             write_out(
                 col_fmt.format(name, tag, i["id"][:12],
                                stringTimeToHuman(i["created"]),
                                convert_size(i["size"])))
Example No. 8
 def verbose(self):
     print('Name: {}'.format(self.name))
     # print('Artist: {}'.format(self.artist.name))
     print('Year: {}'.format(self.year))
     print('Number: {} / {}'.format(self.number,
                                    self.album.number_of_tracks))
     print('URL: {}'.format(self.url))
     print('ID: {}'.format(self.id))
     print('Size: {}'.format(convert_size(self.size)))
     print('Created: {}'.format(self.created_date))
     print('Last Update: {}'.format(self.updated_date))
     if self.live:
         status = 'Live'
     else:
         status = 'Missing'
     print('Status: {}'.format(status))
Example No. 9
def zeromq_server(lim = '10MB', rounds = 0):
	context    = zmq.Context()
	subscriber = context.socket(zmq.SUB)
	subscriber.connect("tcp://localhost:5563")
	subscriber.setsockopt(zmq.SUBSCRIBE, b"emails")

	cnt = 0
	running_bytes = 0
	server_start = dt.datetime.now()

	while True:
		# envelope + data
		[address, data] = subscriber.recv_multipart()
		running_bytes += len(data)
		cnt += 1

		row_data = (
			(dt.datetime.now() - server_start),
			convert_size(running_bytes),
			cnt # "{:,}".format(cnt)
		)
		print(row_print('ZeroMQ', 'single', 'single node',row_data))

		# print('ZeroMQ - single node - %s - %s - %s messages' % (
		# 		(dt.datetime.now() - client_start),
		# 		convert_size(running_bytes),
		# 		"{:,}".format(cnt)
		# 	))

		if data == b'END':
			dump_to_csv('server', 'ZeroMQ', rounds, 'localhost', 'single', 'single node', row_data)
			# sys.exit()
			break

	subscriber.close()
	context.term()
Example No. 10
def main(titleid,
         titlever=None,
         spoofver=None,
         pack_as_cia=True,
         keepcontents=True,
         enc_titlekey=None,
         onlyticket=False,
         cdndir=False,
         base_url="http://nus.cdn.c.shop.nintendowifi.net/ccs/download"):
    if len(titleid) != 16:
        print("Title ID must be 16 characters long.")
        return
    try:
        int(titleid, 16)
    except ValueError:
        print("Title ID must be in hexadecimal.")
        return

    if onlyticket and not enc_titlekey:
        print(
            "Please specify an ecrypted titlekey (--key) for Ticket generation."
        )
        return

    if enc_titlekey:
        if len(enc_titlekey) != 32:
            print("Encrypted title key must be 32 characters long.")
            return
        try:
            int(enc_titlekey, 16)
        except ValueError:
            print("Title key must be in hexadecimal.")
            return

    if not pack_as_cia and not keepcontents:
        print("Running with these settings would produce no output.")
        return

    titleid = titleid.lower()
    nus = CIAGEN.NUS(titleid, titlever, base=base_url)

    if onlyticket:
        print("Generating Ticket for Title {0} v{1}".format(
            titleid, "[Latest]" if titlever == None else titlever))
    else:
        print("Downloading Title {0} v{1}".format(
            titleid, "[Latest]" if titlever == None else titlever))

    # Download TMD
    print("* Downloading TMD...")
    try:
        tmd = nus.tmd
    except HTTPError:
        print("Title not on NUS!")
        return

    # Parse TMD
    print("* Parsing TMD...")
    total_size = 0
    for content in tmd.contents:
        total_size += content.size
    print("    Title Version: {0}".format(tmd.hdr.titleversion))
    print("    {0} Content{1}: {2}".format(
        len(tmd.contents), "s" if len(tmd.contents) > 1 else "",
        utils.convert_size(total_size)))

    if titlever == None:
        titlever = tmd.hdr.titleversion
    else:
        if titlever != tmd.hdr.titleversion:
            print("WARNING: Title version should be {0} but is {1}".format(
                titleid, tmd.hdr.titleversion))

    if titleid != tmd.get_titleid():
        print("WARNING: Title ID should be {0} but is {1}".format(
            titleid, tmd.get_titleid()))

    if spoofver != None:
        tmd.hdr.titleversion = spoofver

    if cdndir:
        titlepath = os.path.join("titles", titleid)
    else:
        titlepath = os.path.join("titles", titleid, str(titlever))
    if not os.path.isdir(titlepath):
        os.makedirs(titlepath)
    if not onlyticket:
        if cdndir:
            tmd.dump(os.path.join(titlepath, "tmd.{0}".format(titlever)))
        else:
            tmd.dump(os.path.join(titlepath, "tmd"))

    # Download Ticket
    if enc_titlekey:
        print("* Generating Ticket...")
        cetk = CIAGEN.Ticket(tickettemplate + magic)
        cetk.hdr.titleid = tmd.hdr.titleid
        cetk.hdr.titleversion = tmd.hdr.titleversion
        cetk.hdr.titlekey = binascii.a2b_hex(enc_titlekey)
        cetk.dump(os.path.join(titlepath, "cetk"))
        if onlyticket:
            print("Finished.")
            return
    else:
        print("* Downloading Ticket...")
        cetk = nus.ticket
        if not cetk:
            if pack_as_cia:
                print("    Ticket unavailable, can't be packed.")
                pack_as_cia = False
            else:
                print("    Ticket unavailable.")
        else:
            cetk.dump(os.path.join(titlepath, "cetk"))

    # Download Contents
    print("* Downloading Contents...")
    for i, content_url in enumerate(nus.get_content_urls()):
        print("    Content #{0} of #{1}: {2} ({3})".format(
            i + 1, tmd.hdr.contentcount, tmd.contents[i].get_cid(),
            utils.convert_size(tmd.contents[i].size)))
        content_path = os.path.join(titlepath, tmd.contents[i].get_cid())
        req = get(content_url, stream=True)
        if req.status_code != 200:
            print(
                "		 Failed to download content: Is the title still on the NUS?"
            )
            return
        with open(content_path, 'wb') as content_file:
            for chunk in req.iter_content(
                    chunk_size=5242880):  # Read in 5 MB chunks
                if chunk:
                    content_file.write(chunk)

        if os.path.getsize(content_path) != tmd.contents[i].size:
            print("		 Content size mismatch. Abort...")
            return

    # Pack as CIA
    if pack_as_cia:
        print("* Creating CIA...")
        cia_path = os.path.join(
            titlepath, "{0}-v{1}{2}.cia".format(
                titleid, titlever, "" if spoofver == None else
                ("-fakev" + str(spoofver))))
        if cdndir:
            CIAGEN.CIAMaker(titlepath, titlever=titlever).dump(cia_path)
        else:
            CIAGEN.CIAMaker(titlepath).dump(cia_path)
        if not os.path.isfile(cia_path):
            print("    CIA creation failed.")
        else:
            print("    CIA creation successful: {0}".format(cia_path))
    else:
        print("Finished.")

    if not keepcontents:
        if cdndir:
            os.remove(os.path.join(titlepath, "tmd.{0}".format(titlever)))
        else:
            os.remove(os.path.join(titlepath, "tmd"))
        try:
            os.remove(os.path.join(titlepath, "cetk"))
        except FileNotFoundError:
            pass
        for content in tmd.contents:
            os.remove(os.path.join(titlepath, content.get_cid()))
Example No. 11
    with open(filename, "w") as index_file:
        json.dump(index, index_file)


if __name__ == "__main__":

    index = {}
    with open("search_dataset.json") as dataset_file:
        products = json.load(dataset_file)
    index_filename = "index.json"

    index["products_by_id"] = utils.call_with_monitor_1(
        build_index_of_products_by_id, products)

    index["bows_by_id"] = utils.call_with_monitor_1(
        build_index_of_bag_of_words_by_id, index["products_by_id"])

    index["ids_by_brand"] = utils.call_with_monitor_1(
        build_index_of_ids_by_brand, index["products_by_id"])

    index["ids_by_term"] = utils.call_with_monitor_1(
        build_index_of_ids_by_term, index["bows_by_id"])

    index["ids_by_collocation"] = utils.call_with_monitor_2(
        build_index_of_ids_by_collocation, index["ids_by_term"],
        index["bows_by_id"])

    utils.call_with_monitor_2(dump_index, index, index_filename)
    print("{:40}".format(index_filename), end='', flush=True)
    print(utils.convert_size(os.stat(index_filename).st_size))
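call_with_monitor_1 and call_with_monitor_2 are not defined on this page; judging from the final print statements, they presumably run one indexing step while reporting its name and duration. A hypothetical one-argument version, for context only:

import time

def call_with_monitor_1(func, arg):
    # Hypothetical: run func(arg), printing the function name and elapsed wall-clock time.
    print("{:40}".format(func.__name__), end='', flush=True)
    start = time.time()
    result = func(arg)
    print("{:.1f}s".format(time.time() - start))
    return result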
Example No. 12
    def __str__(self):
        output = "TMD:\n"
        output += "  Title ID: {0}\n".format(self.get_titleid())
        output += "  Title Version: {0}\n".format(self.hdr.titleversion)
        output += "\n"
        output += "  Number of contents: {0}\n".format(self.hdr.contentcount)
        output += "  Contents:\n"
        output += "   ID         Index   Type           Size       Hash\n"
        for content in self.contents:
            output += "   {:08X}   {:<7d} {:<14s} {:<11s}".format(
                content.cid,
                content.index,
                content.get_type(),
                utils.convert_size(content.size)
            )
            output += binascii.hexlify(content.sha256).decode() + "\n"

        # TODO: Improve this; it is a bit complicated to understand and duplicated
        if self.certificates:
            output += "\n"
            output += "  Certificates:\n"
            try:
                signs_tmd = self.get_cert_by_name(self.get_issuer()[-1])  # CP
                signs_cp = self.get_cert_by_name(self.get_issuer()[1])  # CA
            except ValueError:
                output += "   Could not locate the needed certificates.\n"
                return output
            try:
                signs_ca = self.get_cert_by_name(self.get_issuer()[0])  # Root
            except ValueError:
                signs_ca = None

            # Check TMD signature
            verified = utils.Crypto.verify_signature(
                self.certificates[signs_tmd],
                self.signature_pack(),
                self.signature
            )
            sha256hash = utils.Crypto.create_sha256hash_hex(self.signature_pack())
            output += "    TMD signed by {0} using {1}:\n      {2} ".format(
                "-".join(self.get_issuer()),
                self.certificates[signs_tmd].get_key_type(),
                sha256hash
            )
            if verified:
                output += "[OK]"
            else:
                output += "[FAIL]"
            output += "\n"

            # Check CP signature
            verified = utils.Crypto.verify_signature(
                self.certificates[signs_cp],
                self.certificates[signs_tmd].signature_pack(),
                self.certificates[signs_tmd].signature
            )
            sha256hash = utils.Crypto.create_sha256hash_hex(self.certificates[signs_tmd].signature_pack())
            output += "    {0} ({1}) signed by {2} ({3}):\n      {4} ".format(
                self.certificates[signs_tmd].get_name(),
                self.certificates[signs_tmd].get_key_type(),
                self.certificates[signs_tmd].get_issuer(),
                self.certificates[signs_cp].get_key_type(),
                sha256hash
            )
            if verified:
                output += "[OK]"
            else:
                output += "[FAIL]"
            output += "\n"

            # Check Root signature
            if signs_ca:
                verified = utils.Crypto.verify_signature(
                    signs_ca,
                    self.certificates[signs_cp].signature_pack(),
                    self.certificates[signs_cp].signature
                )
                sha256hash = utils.Crypto.create_sha256hash_hex(self.certificates[signs_cp].signature_pack())
                output += "    {0} ({1}) signed by {2} ({3}):\n      {4} ".format(
                    self.certificates[signs_cp].get_name(),
                    self.certificates[signs_cp].get_key_type(),
                    self.certificates[signs_cp].get_issuer(),
                    ROOT_KEY.get_key_type(),
                    sha256hash
                )
                if verified:
                    output += "[OK]"
                else:
                    output += "[FAIL]"
            else:
                output += "    {0} ({1}) signed by {2}: Please place root-key in the same directory".format(
                    self.certificates[signs_cp].get_name(),
                    self.certificates[signs_cp].get_key_type(),
                    self.certificates[signs_cp].get_issuer()
                )
            output += "\n"

        return output
Example No. 13
    def run(self):
        sizecopied = 0
        if self.prt.tmp:
            fulltmp = self.prt.tmp
        else:
            fulltmp = self.fulldest

        params = {"access_token":self.prt.accesstoken, "path":self.base64path}
        
        log.info("%s size %s", self.nm, self.size_str)

        try:
            apidownloaduri = "%s%s?%s" % (BASE_URL, urllib.quote_plus(self.nm), urllib.urlencode(params))
        except KeyError:
            self.cleanUpAfterError("Error unsupported characters in filename %s" % self.nm, self.destpath)
        except:
            # This should never happen, but you never know.
            self.cleanUpAfterError(traceback.format_exc(), self.destpath)

        if not os.path.exists(self.fulldest) or (self.prt.tmp and not os.path.exists(fulltmp)):
            self.cleanUpAfterError("Missing temp or destination parent directory", self.destpath)
        
        self.tmppath = os.path.join(fulltmp, self.nm)

        if self.prt.tmp:
            filehash = sha1("blob " + str(self.size_bytes) + "\0" + self.tmppath)
            tmpname = filehash.hexdigest()
            self.tmppath = os.path.join(self.prt.tmp, tmpname)

        log.debug("Downloading file to %s", self.tmppath)
        retriesleft = 3
        sizemismatched = False
        failsize = False
        while retriesleft > 0:
            sizecopied = 0
            progress = time.time() + 60
            apiratecount = 0
            try:
                with open(self.tmppath, 'wb') as tmpfile:
                    st = time.time()
                    timespan = 0
                    req = requests.get(apidownloaduri, stream=True, timeout=120)
                    chunk_size  = 1024
                    #if not sizemismatched:
                    chunk_size += 1024 * 1024
                    for chunk in req.iter_content(chunk_size=chunk_size):
                        sizecopied += len(chunk)
                        if self.prt.end:
                            break
                        if chunk: # filter out keep-alive new chunks
                            tmpfile.write(chunk)
                            if self.prt.progress and progress < time.time():
                                progress = time.time() + 60
                                speed = utils.get_speed(sizecopied, (time.time()-st))
                                log.info("%s\nDownloaded %s of %s at %s", self.nm, utils.convert_size(sizecopied), self.size_str, speed)
                            #if sizemismatched:
                            #    tmpfile.flush()
                            #    os.fsync(tmpfile.fileno())
                    timespan = (time.time()-st)
                if sizecopied != self.size_bytes and not self.prt.end:
                    raise SizeMismatchError("Download size mismatch downloaded %s expected %s" % (sizecopied, self.size_bytes))
                elif self.prt.end:
                    self.cleanUpAfterError("Recieved signaled stop during download", self.destpath)
            except (requests.exceptions.Timeout, requests.exceptions.ConnectionError, requests.exceptions.RequestException):
                retriesleft -= 1
                if req.status_code == 429:
                    apiratecount += 1
                    retriesleft += 1
                    log.warn("API rate limit reached. Will retry")
                else:
                    log.warn("Network error. Will retry %s more times", retriesleft)
                if retriesleft > 0:
                    time.sleep(10 * apiratecount)
                else:
                    log.error("Error downloading %s", self.nm)
                    self.cleanUpAfterError("Maximum retries reached", self.destpath)
            except SizeMismatchError:
                retriesleft -= 1
                log.exception("%s File size mismatch. Will retry %s more times", self.nm, retriesleft)
                sizemismatched = True
                if retriesleft == 2:
                    failsize = sizecopied
                elif failsize and failsize != sizecopied:
                    failsize = False

                if retriesleft > 0:
                    time.sleep(10)
                elif failsize:
                    log.warn("%s\nRecieved incorrect file size %s instead of %s 3 times. Saving anyway", self.nm, sizecopied, self.size_bytes)
                else:
                    log.error("Error downloading %s", self.nm)
                    self.cleanUpAfterError("Maximum retries reached", self.destpath)
            except (IOError, OSError, WindowsError):
                log.exception("Error writing to file %s", self.nm)
                self.cleanUpAfterError(traceback.format_exc(), self.destpath)
            except (requests.exceptions.RequestException, Exception):
                log.error("Error downloading %s", self.nm)
                self.cleanUpAfterError(traceback.format_exc(), self.destpath)
            except SystemExit:
                self.cleanUpAfterError("Received signal exit", self.destpath)
                raise
            except:
                if req.status_code in [429, 503]:
                    apiratecount += 1
                    retriesleft += 1
                    log.warn("API rate limit reached. Will retry")
                else:
                    log.exception("Error downloading %s. Will retry %s more times", self.nm, retriesleft)

                if retriesleft > 0:
                    time.sleep(10 * apiratecount)
                else:
                    self.cleanUpAfterError("An unknown error occured", self.destpath)
            else:
                retriesleft = 0
                self.prt.downloadtime += timespan
                if self.prt.progress:
                    speed = utils.get_speed(self.size_bytes, timespan)
                    log.info("%s downloaded at %s", self.size_str, speed)

        if self.prt.end:
            log.warn("Parent signaled stop")
            return
        self.prt.bytestotal += self.size_bytes
        if self.prt.tmp:
            log.info("Copying from temp to dest")
            retriesleft = 3
            while retriesleft > 0:
                try:
                    st = time.time()
                    timespan = 0
                    with open(self.tmppath, 'rb') as f, open(self.destpath, "wb") as fo:
                        while not self.prt.end:
                            piece = f.read(1024)
                            if piece:
                                fo.write(piece)
                            else:
                                break

                    timespan = (time.time()-st)
                    if self.prt.end:
                        self.cleanUpAfterError("Recieved signaled stop during copy", self.destpath)
                except (IOError, OSError, WindowsError) as e:
                    retriesleft -= 1
                    if retriesleft > 0:
                        self.delete_dest()
                        log.exception("Error copying file wil retry %s more times", retriesleft)
                    else:
                        log.exception("Error file could not be copied to %s", self.destpath)
                        self.cleanUpAfterError(traceback.format_exc(), self.destpath)
                except SystemExit:
                    self.cleanUpAfterError("Received signal exit", self.destpath)
                    raise
                except:
                    # This should never happen, but you never know.
                    retriesleft -= 1
                    if retriesleft > 0:
                        self.delete_dest()
                        log.exception("Error copying file wil retry %s more times", retriesleft)
                    else:
                        log.exception("Error file could not be copied to %s", self.destpath)
                        self.cleanUpAfterError(traceback.format_exc(), self.destpath)
                else:
                    retriesleft = 0
                    self.prt.copytime += timespan
                    if self.prt.progress:
                        speed = utils.get_speed(self.size_bytes, timespan)
                        log.info("%s copied at %s", self.size_str, speed)
                    try:
                        os.remove(self.tmppath)
                    except (IOError, OSError) as e:
                        log.warn("Failed cleaning up tmp file %s\n%s", self.tmppath, e)
        self.prt.writeSuccess(self.destpath)
        log.info("Finished download %s in ", self.destpath)
        log.debug("End of thread")
        self.prt.threads[self.thread_num] = None
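utils.get_speed, used in the progress logging above, is another helper that is not shown. It presumably divides bytes by elapsed seconds and reuses convert_size for display; a minimal sketch under that assumption:

def get_speed(num_bytes, seconds):
    # Hypothetical: average transfer rate as a human-readable string.
    if seconds <= 0:
        return "N/A"
    return convert_size(num_bytes / float(seconds)) + "/s"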
Example No. 14
def main(titleid,
         titlever=None,
         pack_as_wad=True,
         decryptcontents=False,
         localuse=True,
         keepcontents=True,
         enc_titlekey=None,
         onlyticket=False,
         cdndir=False,
         base_url="http://nus.cdn.shop.wii.com/ccs/download"):
    if len(titleid) != 16:
        print("Title ID must be 16 characters long.")
        return
    try:
        int(titleid, 16)
    except ValueError:
        print("Title ID must be in hexadecimal.")
        return

    if onlyticket and not enc_titlekey:
        print(
            "Please specify an ecrypted titlekey (--key) for Ticket generation."
        )
        return

    if enc_titlekey:
        if len(enc_titlekey) != 32:
            print("Encrypted title key must be 32 characters long.")
            return
        try:
            int(enc_titlekey, 16)
        except ValueError:
            print("Title key must be in hexadecimal.")
            return

    if not pack_as_wad and not keepcontents and not decryptcontents:
        print("Running with these settings would produce no output.")
        return

    titleid = titleid.lower()
    nus = WADGEN.NUS(titleid, titlever, base=base_url)

    if onlyticket:
        print("Generating Ticket for Title {0} v{1}".format(
            titleid, "[Latest]" if titlever == None else titlever))
    else:
        print("Downloading Title {0} v{1}".format(
            titleid, "[Latest]" if titlever == None else titlever))

    # Download TMD
    print("* Downloading TMD...")
    try:
        tmd = nus.tmd
    except HTTPError:
        print("Title not on NUS!")
        return

    # Parse TMD
    print("* Parsing TMD...")
    total_size = 0
    for content in tmd.contents:
        total_size += content.size
    print("    Title Version: {0}".format(tmd.hdr.titleversion))
    print("    {0} Content{1}: {2}".format(
        len(tmd.contents), "s" if len(tmd.contents) > 1 else "",
        utils.convert_size(total_size)))

    if titlever == None:
        titlever = tmd.hdr.titleversion
    else:
        if titlever != tmd.hdr.titleversion:
            print("    WARNING: Title version should be {0} but is {1}".format(
                titleid, tmd.hdr.titleversion))

    if titleid != tmd.get_titleid():
        print(
            "    WARNING: Title ID should be {0} but is {1} (ignore for vWii)".
            format(titleid, tmd.get_titleid()))

    if cdndir:
        titlepath = os.path.join("titles", titleid)
    else:
        titlepath = os.path.join("titles", titleid, str(titlever))
    if not os.path.isdir(titlepath):
        os.makedirs(titlepath)
    if not onlyticket:
        if cdndir:
            tmd.dump(os.path.join(titlepath, "tmd.{0}".format(titlever)))
        else:
            tmd.dump(os.path.join(titlepath, "tmd"))

    # Download Ticket
    if enc_titlekey:
        # TODO: Generate DSi tickets
        print("* Generating Ticket...")
        cetk = WADGEN.Ticket(tickettemplate)
        cetk.hdr.titleid = tmd.hdr.titleid
        cetk.hdr.titleversion = tmd.hdr.titleversion
        cetk.hdr.titlekey = binascii.a2b_hex(enc_titlekey)
        if tmd.get_region() == "Korea" and not tmd.get_titleid().startswith(
                "00030"):  # Korea + not DSi
            cetk.hdr.ckeyindex = 1  # Korean common-key index
        elif titleid.startswith("00000007") or titleid.startswith(
                "0007"):  # Wii U Wii Mode
            cetk.hdr.ckeyindex = 2  # vWii common-key index
        cetk.dump(os.path.join(titlepath, "cetk"))
        if localuse:  # We need to set Title IV and decrypt the titlekey for verifying
            cetk.titleiv = struct.pack(">Q", cetk.hdr.titleid) + b"\x00" * 8
            cetk.decrypted_titlekey = utils.Crypto.decrypt_titlekey(
                commonkey=cetk.get_decryption_key(),
                iv=cetk.titleiv,
                titlekey=cetk.hdr.titlekey)

        if onlyticket:
            print("Finished.")
            return
    else:
        print("* Downloading Ticket...")
        cetk = nus.ticket
        if not cetk:
            if pack_as_wad:
                print("    Ticket unavailable, can't pack nor verify.")
                pack_as_wad = False
            else:
                print("    Ticket unavailable, can't verify download.")
        else:
            cetk.dump(os.path.join(titlepath, "cetk"))

    if decryptcontents and not keepcontents and not cetk:
        print(
            "Aborted, because contents should be deleted and decrypting is not possible."
        )
        return

    # Download Contents
    print("* Downloading Contents...")
    for i, content_url in enumerate(nus.get_content_urls()):
        print("    Content #{0} of #{1}: {2} ({3})".format(
            i + 1, tmd.hdr.contentcount, tmd.contents[i].get_cid(),
            utils.convert_size(tmd.contents[i].size)))
        content_path = os.path.join(titlepath, tmd.contents[i].get_cid())

        # Local Use
        if localuse and cetk:
            if os.path.isfile(content_path):
                with open(content_path, "rb") as content_file:
                    valid, decdata = utils.Crypto.check_content_hash(
                        tmd.contents[i],
                        cetk,
                        content_file.read(),
                        return_decdata=True)
                    if valid:
                        print("      Content exists and has been verified!")
                        if decryptcontents:
                            print("      Decrypting...")
                            with open(content_path + ".app",
                                      "wb") as decrypted_content_file:
                                decrypted_content_file.write(decdata)
                        continue  # Go on with the next content
                    else:
                        print(
                            "      Content exists, but hash check failed - redownloading..."
                        )

        # Start content download by sending GET request (in stream mode)
        content_data = b''
        with http_session.get(content_url, timeout=30, stream=True) as req:
            # Check status code and content length element
            content_total_length = req.headers.get('content-length')
            if req.status_code != 200 or content_total_length is None:
                print("      Failed to download content")
                return

            # Download data in chunks
            progress = 0
            content_total_length = int(content_total_length)
            content_total_length_str = utils.convert_size(content_total_length)
            for chunk in req.iter_content(chunk_size=(1024 * 256)):
                if not chunk:
                    break

                content_data += chunk
                progress += len(chunk)
                percentage = int(progress * 100 / content_total_length)
                bar_length = int(percentage / 2)

                sys.stdout.write("\r%s" % (' ' * 100))
                sys.stdout.flush()

                if progress < content_total_length:
                    sys.stdout.write(
                        "\r      %u%% [%s%s] %s / %s" %
                        (percentage, '=' * bar_length, ' ' *
                         (50 - bar_length), utils.convert_size(progress),
                         content_total_length_str))
                else:
                    sys.stdout.write("\r      100%% [%s] %s / %s" %
                                     ('=' * 50, content_total_length_str,
                                      content_total_length_str))
                sys.stdout.flush()

        sys.stdout.write("\r%s\r" % (' ' * 100))
        sys.stdout.flush()

        # Verify after download
        if cetk:
            valid, decdata = utils.Crypto.check_content_hash(
                tmd.contents[i], cetk, content_data, return_decdata=True)
            if not valid:
                print("      Hash check failed.")
                return
            if decryptcontents:
                print("      Decrypting...")
                with open(content_path + ".app",
                          "wb") as decrypted_content_file:
                    decrypted_content_file.write(decdata)

        with open(content_path, 'wb') as content_file:
            content_file.write(content_data)

    # Pack as WAD
    if pack_as_wad:
        if not cetk.get_titleid().startswith("00030"):
            print("* Creating WAD...")
            wad_path = os.path.join(titlepath,
                                    "{0}-v{1}.wad".format(titleid, titlever))
            if cdndir:
                WADGEN.WADMaker(titlepath, titlever=titlever).dump(wad_path)
            else:
                WADGEN.WADMaker(titlepath).dump(wad_path)
            if not os.path.isfile(wad_path):
                print("    WAD creation failed.")
            else:
                print("    WAD creation successful: {0}".format(wad_path))
    else:
        print("Finished.")

    if not keepcontents:
        if cdndir:
            os.remove(os.path.join(titlepath, "tmd.{0}".format(titlever)))
        else:
            os.remove(os.path.join(titlepath, "tmd"))
        try:
            os.remove(os.path.join(titlepath, "cetk"))
        except FileNotFoundError:
            pass
        for content in tmd.contents:
            os.remove(os.path.join(titlepath, content.get_cid()))
Example No. 15
def main():
    parser = argparse.ArgumentParser(
        description=
        'PyTorch Variational Autoencoders for Collaborative Filtering')
    parser.add_argument(
        '--json_file',
        type=str,
        default=r'C:\Users\iyeshuru\Downloads\dblp_papers_v11.txt',
        help='Processed input h5 file.')
    parser.add_argument('--limit',
                        type=int,
                        default=None,
                        help='Limit number of data to process.')
    parser.add_argument('--release',
                        action='store_true',
                        help='Build only training set (no validation/test)')
    args = parser.parse_args()

    # json_file = 'C:\\Users\iyeshuru\PycharmProjects\PapersProject\\flow\dblp.cut'
    # json_file = 'C:\\Users\iyeshuru\PycharmProjects\PapersProject\\flow\dblp.large.cut'
    # json_file = 'C:\\Users\iyeshuru\Downloads\dblp_papers_v11.txt'
    json_file = r'C:\Users\iyeshuru\PycharmProjects\PapersProject\flow\dblp_test.txt'
    # json_file = args.json_file

    args.release = True
    args.limit = 1000

    DATA_DIR = 'data/'

    # index2paper
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)

    raw_output = os.path.join(DATA_DIR, 'raw_output.h5')
    processed_output_file = os.path.join(DATA_DIR, 'processed_output.h5')
    embeddings_output_file = os.path.join(DATA_DIR, 'embeddings_output.h5')
    author2idx_pickle = os.path.join(DATA_DIR, 'author2idx.pickle')
    idx2author_pickle = os.path.join(DATA_DIR, 'idx2author.pickle')
    paper2embedding_idx_pickle = os.path.join(DATA_DIR,
                                              'paper2embedding_idx.pickle')

    # Process data
    create_paper_author_score_triples(json_file, raw_output, args.limit)

    paper2idxs = {}

    # Create paper id to indexes (group)
    warnings.warn("Loading all paper IDs into memory.")
    with h5py.File(raw_output, 'r') as f:
        for i, (paper, _, _) in tqdm(enumerate(f['paper_author_score']),
                                     total=len(f['paper_author_score']),
                                     desc='Building paper2idxs mapping'):
            if paper not in paper2idxs:
                paper2idxs[paper] = []
            paper2idxs[paper].append(i)

    unique_papers_count = len(paper2idxs.keys())
    n_heldout_users = int(unique_papers_count * 0.2)

    # Split Train/Validation/Test User Indices
    if args.release:
        print("Release: building only train set.")
        tr_papers_index_range = [0, unique_papers_count]
        vd_papers_index_range = [0, 0]
        te_papers_index_range = [0, 0]
    else:
        tr_papers_index_range = [0, unique_papers_count - n_heldout_users * 2]
        vd_papers_index_range = [
            unique_papers_count - n_heldout_users * 2,
            unique_papers_count - n_heldout_users
        ]
        te_papers_index_range = [
            unique_papers_count - n_heldout_users, unique_papers_count
        ]

    for dataset, index_range in zip(["Train", "Validation", "Test"], [
            tr_papers_index_range,
            vd_papers_index_range,
            te_papers_index_range,
    ]):
        print("{} papers: {}".format(dataset, index_range[1] - index_range[0]))

    warnings.warn("Loading all paper indexes into memory.")
    ranges = [
        tr_papers_index_range, vd_papers_index_range, te_papers_index_range
    ]

    tr_indexes = [
        item for sublist in list(paper2idxs.values())
        [tr_papers_index_range[0]:tr_papers_index_range[1]] for item in sublist
    ]

    unique_train_authors = unique(raw_output, 1, tr_indexes)
    unique_train_authors_count = len(unique_train_authors)

    author2idx = dict((pid, i) for (i, pid) in enumerate(unique_train_authors))

    with h5py.File(raw_output,
                   'r') as raw_f, h5py.File(processed_output_file,
                                            'w') as processed_f:

        ################## Add Paper, author, scores data ##############################
        paper_author_score_ds = raw_f['paper_author_score']

        # Save mapping to ds
        with open(author2idx_pickle, 'wb') as handle:
            pickle.dump(author2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

        print("Saved {} (size: {})".format(
            author2idx_pickle,
            convert_size(os.path.getsize(author2idx_pickle))))

        # Save reverse mapping to ds (used in inference)
        idx2author = {}
        for author, idx in tqdm(author2idx.items(),
                                total=len(author2idx),
                                desc='Saving reverse mapping'):
            idx2author[idx] = author

        with open(idx2author_pickle, 'wb') as handle:
            pickle.dump(idx2author, handle, protocol=pickle.HIGHEST_PROTOCOL)

        print("Saved {} (size: {})".format(
            idx2author_pickle,
            convert_size(os.path.getsize(idx2author_pickle))))

        del idx2author
        gc.collect()

        train_grp = processed_f.create_group('train')
        validation_grp = processed_f.create_group('validation')
        test_grp = processed_f.create_group('test')

        # Creating datasets and buffers for all train/validation/test ->train/test combinations
        list_pairs = list(
            map(
                lambda grp: [
                    DatasetBuffer(
                        grp.create_dataset(
                            'train',
                            maxshape=paper_author_score_ds.shape,
                            shape=(0, paper_author_score_ds.shape[1]),
                            chunks=paper_author_score_ds.chunks,
                            dtype=paper_author_score_ds.dtype)),
                    DatasetBuffer(
                        grp.create_dataset(
                            'test',
                            maxshape=paper_author_score_ds.shape,
                            shape=(0, paper_author_score_ds.shape[1]),
                            chunks=paper_author_score_ds.chunks,
                            dtype=paper_author_score_ds.dtype))
                ], [train_grp, validation_grp, test_grp]))

        buffers = [item for sublist in list_pairs for item in sublist]
        # tr_tr, tr_te, val_tr, val_te, test_te, test_tr = buffers

        # Filter by author, group by paper and split by proportion.
        for i, ((tr_tr, tr_te),
                index_range) in enumerate(zip(list_pairs, ranges)):
            print("Building dataset: {}".format(
                ['Train', 'Validation', 'Test'][i]))
            # indexes = [item for sublist in list(paper2idxs.items())[index_range[0]:index_range[1]] for item in sublist]
            add_data(paper_author_score_ds,
                     tr_tr,
                     tr_te,
                     index_range,
                     paper2idxs,
                     author2idx,
                     test_proportion=None if i == 0 else 0.2)

        for buffer in buffers:
            buffer.close()

    ########## Add title embeddings #######################
    # TODO: Pass the last two args differently...
    from get_embeddings import collect_embeddings
    # collect_embeddings(json_file, embeddings_output_file, args.limit, CHUNK_SIZE, skip_paper)
    collect_embeddings(paper2embedding_idx_pickle, json_file,
                       unique_papers_count, embeddings_output_file, args.limit,
                       CHUNK_SIZE, skip_paper)

    print("Done!")