def random_trains(self, replace):
    """Build (or reload) the random training set and wrap it in a StringDataset.

    Generated strings and their pairwise distance / kNN matrices are cached
    on disk under a directory keyed by dataset, max length, shuffle seed,
    mode and sample count, so repeated runs with the same configuration
    reuse the same samples.

    Args:
        replace: if True the random strings *replace* the original training
            samples; if False they are appended to them.

    Side effects: sets self.train_rnd, self.train_dist, self.train_knn and
    self.xt; creates/reads files under the cache directory.
    """
    root_dir = "folder/{}_{}/{}/{}/{}/".format(
        self.dataset, self.args.maxl, self.args.shuffle_seed,
        "random" if replace else "append", self.args.nr)
    os.makedirs(root_dir, exist_ok=True)
    random_text = root_dir + "random.txt"
    if not os.path.isfile(random_text):
        # Cache miss: draw args.nr random strings of max length M.
        print("# generate random training samples " + random_text)
        self.train_rnd = [self.random_str(self.M) for _ in range(self.args.nr)]
        if not replace:
            # Keep the original training samples and append the random ones.
            print("# appended to training samples " + random_text)
            self.train_rnd = [self.lines[i] for i in self.train_ids] + self.train_rnd
        with open(random_text, "w") as w:
            w.writelines("%s\n" % line for line in self.train_rnd)
        # Distance/kNN computation is expensive, so cache the results too.
        self.train_dist, self.train_knn = get_dist_knn(self.train_rnd)
        np.save(root_dir + "random_train_dist.npy", self.train_dist)
        np.save(root_dir + "random_train_knn.npy", self.train_knn)
    else:
        # Cache hit: reload strings and the precomputed matrices.
        print("# loading random training samples " + random_text)
        # BUGFIX: dropped the original random_text.format(self.args.dataset)
        # -- a no-op (the path contains no "{}" placeholder) that only
        # obscured the intent; read the cached file path directly.
        self.train_rnd = readlines(random_text)
        self.train_dist = np.load(root_dir + "random_train_dist.npy")
        self.train_knn = np.load(root_dir + "random_train_knn.npy")
    _, _, train_sig, alphabet = word2sig(lines=self.train_rnd, max_length=self.M)
    self.xt = StringDataset(self.C, self.M, train_sig)
def __init__(self, args, data_f):
    """Load the raw dataset, encode it, and build the train/query/base splits.

    Args:
        args: parsed command-line namespace (reads nt, nq, maxl, dataset).
        data_f: stored as-is for later use by other methods.
    """
    self.data_f = data_f
    self.args = args
    self.nt = args.nt
    self.nq = args.nq
    self.maxl = args.maxl
    self.dataset = args.dataset
    self.lines = readlines("data/{}".format(args.dataset))
    # maxl == 0 means "no truncation"; otherwise clip every line.
    if self.maxl:
        self.lines = [line[:self.maxl] for line in self.lines]
    self.ni = len(self.lines)
    self.nb = self.ni - self.nq - self.nt
    start_time = time.time()
    self.C, self.M, self.char_ids, self.alphabet = word2sig(self.lines, max_length=None)
    print("# Loading time: {}".format(time.time() - start_time))
    self.load_ids()
    self.load_dist()

    def _subset(ids):
        # Wrap the encoded strings of one split in a StringDataset.
        return StringDataset(self.C, self.M, [self.char_ids[i] for i in ids])

    self.xt = _subset(self.train_ids)
    self.xq = _subset(self.query_ids)
    self.xb = _subset(self.base_ids)
def random_trains(self, replace):
    """Build (or reload) the random training set and wrap it in a StringDataset.

    Roughly 90% of the random strings are drawn from the basic Latin
    alphabet only (the first 52 entries of self.alphabet, i.e. a-z and A-Z)
    and the remaining ~10% from the full dataset alphabet.  Strings and
    their distance / kNN matrices are cached on disk keyed by dataset,
    max length, shuffle seed, mode and sample count.

    Args:
        replace: if True the random strings *replace* the original training
            samples; if False they are appended to them.

    Side effects: sets self.train_rnd, self.train_dist, self.train_knn and
    self.xt; creates/reads files under the cache directory.
    """
    root_dir = "folder/{}_{}/{}/{}/{}/".format(
        self.dataset, self.args.maxl, self.args.shuffle_seed,
        "random" if replace else "append", self.args.nr)
    os.makedirs(root_dir, exist_ok=True)
    random_text = root_dir + "random.txt"
    if not os.path.isfile(random_text):
        print("# generate random training samples " + random_text)
        # random_str() draws from self.alphabet, so temporarily narrow the
        # alphabet to its first 52 entries for ~90% of the samples.
        # BUGFIX: restore the alphabet in a try/finally so an exception in
        # random_str cannot leave self.alphabet permanently truncated.
        tmp_alphabet = self.alphabet
        first_nr = round(self.args.nr * 9 / 10)
        try:
            self.alphabet = self.alphabet[:26 * 2]  # let it be only a-z and A-Z
            print("# Using " + self.alphabet + " to generate "
                  + str(first_nr) + " strings")
            first_train_rnd = [self.random_str(self.M) for _ in range(first_nr)]
        finally:
            self.alphabet = tmp_alphabet
        print("# Using " + self.alphabet + " to generate "
              + str(self.args.nr - first_nr) + " strings")
        self.train_rnd = first_train_rnd + [
            self.random_str(self.M) for _ in range(self.args.nr - first_nr)
        ]
        if not replace:
            # Keep the original training samples and append the random ones.
            print("# appended to training samples " + random_text)
            self.train_rnd = [self.lines[i] for i in self.train_ids] + self.train_rnd
        print("# final training samples: ", len(self.train_rnd))
        with open(random_text, "w") as w:
            w.writelines("%s\n" % line for line in self.train_rnd)
        # Distance/kNN computation is expensive, so cache the results too.
        self.train_dist, self.train_knn = get_dist_knn(self.train_rnd)
        np.save(root_dir + "random_train_dist.npy", self.train_dist)
        np.save(root_dir + "random_train_knn.npy", self.train_knn)
    else:
        # Cache hit: reload strings and the precomputed matrices.
        print("# loading random training samples " + random_text)
        # BUGFIX: dropped the no-op random_text.format(self.args.dataset)
        # (the path contains no "{}" placeholder).
        self.train_rnd = readlines(random_text)
        self.train_dist = np.load(root_dir + "random_train_dist.npy")
        self.train_knn = np.load(root_dir + "random_train_knn.npy")
    _, _, train_sig, alphabet = word2sig(lines=self.train_rnd, max_length=self.M)
    self.xt = StringDataset(self.C, self.M, train_sig)
def __init__(self, args, data_f):
    """Load the dataset, optionally reuse precomputed char mappings, build splits.

    In addition to the plain loader, this variant prepends the 52 basic
    Latin characters (a-z, A-Z) to the data so the learned alphabet always
    contains them, and can seed word2sig with a char-to-id mapping pickled
    by a previous run (args.pre_mappings).

    Args:
        args: parsed command-line namespace (reads nt, nq, maxl, dataset,
            pre_mappings).
        data_f: stored as-is for later use by other methods.
    """
    self.data_f = data_f
    self.args = args
    self.nt = args.nt
    self.nq = args.nq
    self.maxl = args.maxl
    self.dataset = args.dataset
    self.lines = readlines("data/{}".format(args.dataset))
    if self.maxl != 0:
        self.lines = [l[:self.maxl] for l in self.lines]
    # NOTE: ni is taken *before* the 52 alphabet characters are prepended,
    # so the nb/nq/nt arithmetic below refers to the original dataset size.
    self.ni = len(self.lines)
    self.lines = [chr(i + ord('a')) for i in range(26)] \
        + [chr(i + ord('A')) for i in range(26)] + self.lines
    print("Add basic characters (a-z) and (A-Z) into the data")
    self.nb = self.ni - self.nq - self.nt
    self.pre_mappings = None
    # BUGFIX: compare against None with "is not", not "!=" (identity check
    # is the idiomatic and robust test for a missing optional argument).
    if args.pre_mappings is not None:
        # SECURITY NOTE: pickle.load executes arbitrary code on load; only
        # use mapping files produced by trusted runs of this project.
        with open(args.pre_mappings, 'rb') as f:
            mappings = pickle.load(f)
        self.pre_mappings = mappings["char_to_id"]
        print("# Load pre calculated mappings from " + args.pre_mappings)
        print("# Loaded chars " + str(''.join(mappings["char_to_id"].keys())))
    start_time = time.time()
    self.C, self.M, self.char_ids, self.alphabet = word2sig(
        self.lines, max_length=None, pre_alphabet=self.pre_mappings)
    print("# Loading time: {}".format(time.time() - start_time))
    print("# Alphabet: ", ''.join(sorted(list(self.alphabet))))
    self.load_ids()
    self.load_dist()
    # Wrap each split's encoded strings in a StringDataset.
    self.xt = StringDataset(self.C, self.M,
                            [self.char_ids[i] for i in self.train_ids])
    self.xq = StringDataset(self.C, self.M,
                            [self.char_ids[i] for i in self.query_ids])
    self.xb = StringDataset(self.C, self.M,
                            [self.char_ids[i] for i in self.base_ids])