Exemple #1
0
    def random_trains(self, replace):
        """Create or reload a random training set and rebuild ``self.xt``.

        Everything is cached under
        ``folder/<dataset>_<maxl>/<shuffle_seed>/<random|append>/<nr>/`` as
        ``random.txt``, ``random_train_dist.npy`` and ``random_train_knn.npy``.

        Args:
            replace: If true, the training set consists only of the
                ``self.args.nr`` strings returned by ``self.random_str``.
                If false, those strings are appended to the existing
                training lines (``self.lines[i]`` for ``i`` in
                ``self.train_ids``).

        Side effects:
            Sets ``self.train_rnd``, ``self.train_dist``, ``self.train_knn``
            and ``self.xt``; may create the cache directory and files.
        """
        root_dir = "folder/{}_{}/{}/{}/{}/".format(
            self.dataset, self.args.maxl, self.args.shuffle_seed,
            "random" if replace else "append", self.args.nr
        )
        os.makedirs(root_dir, exist_ok=True)

        random_text = root_dir + "random.txt"
        dist_path = root_dir + "random_train_dist.npy"
        knn_path = root_dir + "random_train_knn.npy"

        regenerated = not os.path.isfile(random_text)
        if regenerated:
            print("# generate random training samples " + random_text)
            self.train_rnd = [
                self.random_str(self.M) for _ in range(self.args.nr)
            ]
            if not replace:
                print("# appended to training samples " + random_text)
                self.train_rnd = (
                    [self.lines[i] for i in self.train_ids] + self.train_rnd
                )
            with open(random_text, "w") as w:
                w.writelines("%s\n" % line for line in self.train_rnd)
        else:
            print("# loading random training samples " + random_text)
            # The path is used verbatim; passing it through str.format would
            # fail (or silently change it) if it ever contained braces.
            self.train_rnd = readlines(random_text)

        # Reuse the cached arrays only when the samples were loaded from disk
        # and both arrays exist. Freshly generated samples must never be
        # paired with stale arrays, and a cache left half-written by an
        # interrupted run is repaired instead of crashing in np.load.
        arrays_cached = os.path.isfile(dist_path) and os.path.isfile(knn_path)
        if not regenerated and arrays_cached:
            self.train_dist = np.load(dist_path)
            self.train_knn = np.load(knn_path)
        else:
            self.train_dist, self.train_knn = get_dist_knn(self.train_rnd)
            np.save(dist_path, self.train_dist)
            np.save(knn_path, self.train_knn)

        # Only the third element returned by word2sig is needed here.
        _, _, train_sig, _ = word2sig(lines=self.train_rnd, max_length=self.M)
        self.xt = StringDataset(self.C, self.M, train_sig)
Exemple #2
0
    def __init__(self, args, data_f):
        """Load the dataset named by ``args`` and build the three splits.

        Reads ``data/<args.dataset>``, optionally truncates every line to
        ``args.maxl`` characters (``maxl == 0`` disables truncation), encodes
        the lines with ``word2sig`` and wraps the train / query / base
        subsets in ``StringDataset`` objects (``self.xt``, ``self.xq``,
        ``self.xb``).

        Args:
            args: Settings object; ``nt``, ``nq``, ``maxl`` and ``dataset``
                are read here.
            data_f: Stored unchanged on ``self.data_f``.
        """
        self.args = args
        self.data_f = data_f
        self.dataset = args.dataset
        self.maxl = args.maxl
        self.nt = args.nt
        self.nq = args.nq

        raw_lines = readlines("data/{}".format(args.dataset))
        if self.maxl != 0:
            raw_lines = [text[: self.maxl] for text in raw_lines]
        self.lines = raw_lines

        # Item counts: total, then what remains after the query and train
        # shares are subtracted.
        self.ni = len(self.lines)
        self.nb = self.ni - self.nq - self.nt

        tic = time.time()
        self.C, self.M, self.char_ids, self.alphabet = word2sig(
            self.lines, max_length=None
        )
        print("# Loading time: {}".format(time.time() - tic))

        # NOTE(review): train_ids / query_ids / base_ids are not assigned in
        # this method; presumably load_ids() provides them - confirm.
        self.load_ids()
        self.load_dist()

        def subset(ids):
            """Wrap the encoded lines selected by ``ids`` in a StringDataset."""
            return StringDataset(
                self.C, self.M, [self.char_ids[i] for i in ids]
            )

        self.xt = subset(self.train_ids)
        self.xq = subset(self.query_ids)
        self.xb = subset(self.base_ids)
Exemple #3
0
    def random_trains(self, replace):
        """Create or reload a random training set and rebuild ``self.xt``.

        ``round(self.args.nr * 9 / 10)`` strings are generated while
        ``self.alphabet`` is temporarily cut down to its first 52 symbols;
        the remaining strings are generated with the full alphabet.
        Everything is cached under
        ``folder/<dataset>_<maxl>/<shuffle_seed>/<random|append>/<nr>/`` as
        ``random.txt``, ``random_train_dist.npy`` and ``random_train_knn.npy``.

        Args:
            replace: If true, the training set consists only of the random
                strings. If false, they are appended to the existing
                training lines (``self.lines[i]`` for ``i`` in
                ``self.train_ids``).

        Side effects:
            Sets ``self.train_rnd``, ``self.train_dist``, ``self.train_knn``
            and ``self.xt``; may create the cache directory and files.
        """
        root_dir = "folder/{}_{}/{}/{}/{}/".format(
            self.dataset, self.args.maxl, self.args.shuffle_seed,
            "random" if replace else "append", self.args.nr)
        os.makedirs(root_dir, exist_ok=True)

        random_text = root_dir + "random.txt"
        dist_path = root_dir + "random_train_dist.npy"
        knn_path = root_dir + "random_train_knn.npy"

        regenerated = not os.path.isfile(random_text)
        if regenerated:
            print("# generate random training samples " + random_text)
            first_nr = round(self.args.nr * 9 / 10)

            full_alphabet = self.alphabet
            # Let it be only a-z and A-Z.
            # NOTE(review): this assumes the first 52 alphabet symbols are
            # exactly those letters, and that random_str reads
            # self.alphabet - confirm both.
            self.alphabet = full_alphabet[:26 * 2]
            try:
                print("# Using " + self.alphabet + " to generate " +
                      str(first_nr) + " strings")
                first_train_rnd = [
                    self.random_str(self.M) for _ in range(first_nr)
                ]
            finally:
                # Always restore the full alphabet, even if generation
                # fails, so the instance is never left truncated.
                self.alphabet = full_alphabet

            print("# Using " + self.alphabet + " to generate " +
                  str(self.args.nr - first_nr) + " strings")
            self.train_rnd = first_train_rnd + [
                self.random_str(self.M) for _ in range(self.args.nr - first_nr)
            ]
            if not replace:
                print("# appended to training samples " + random_text)
                self.train_rnd = [self.lines[i]
                                  for i in self.train_ids] + self.train_rnd
            print("# final training samples: ", len(self.train_rnd))
            with open(random_text, "w") as w:
                w.writelines("%s\n" % line for line in self.train_rnd)
        else:
            print("# loading random training samples " + random_text)
            # The path is used verbatim; passing it through str.format would
            # fail (or silently change it) if it ever contained braces.
            self.train_rnd = readlines(random_text)

        # Reuse the cached arrays only when the samples were loaded from disk
        # and both arrays exist. Freshly generated samples must never be
        # paired with stale arrays, and a cache left half-written by an
        # interrupted run is repaired instead of crashing in np.load.
        arrays_cached = os.path.isfile(dist_path) and os.path.isfile(knn_path)
        if not regenerated and arrays_cached:
            self.train_dist = np.load(dist_path)
            self.train_knn = np.load(knn_path)
        else:
            self.train_dist, self.train_knn = get_dist_knn(self.train_rnd)
            np.save(dist_path, self.train_dist)
            np.save(knn_path, self.train_knn)

        # Only the third element returned by word2sig is needed here.
        _, _, train_sig, _ = word2sig(lines=self.train_rnd,
                                      max_length=self.M)
        self.xt = StringDataset(self.C, self.M, train_sig)
Exemple #4
0
    def __init__(self, args, data_f):
        """Load the dataset named by ``args`` and build the three splits.

        Reads ``data/<args.dataset>``, optionally truncates every line to
        ``args.maxl`` characters (``maxl == 0`` disables truncation),
        prepends one single-character line for each of a-z and A-Z, encodes
        the lines with ``word2sig`` and wraps the train / query / base
        subsets in ``StringDataset`` objects (``self.xt``, ``self.xq``,
        ``self.xb``).

        Args:
            args: Settings object; ``nt``, ``nq``, ``maxl``, ``dataset`` and
                ``pre_mappings`` are read here. ``pre_mappings`` is either
                ``None`` or the path of a pickle file holding a dict with a
                ``"char_to_id"`` entry.
            data_f: Stored unchanged on ``self.data_f``.
        """
        self.data_f = data_f
        self.args = args
        self.nt = args.nt
        self.nq = args.nq
        self.maxl = args.maxl
        self.dataset = args.dataset

        self.lines = readlines("data/{}".format(args.dataset))

        if self.maxl != 0:
            self.lines = [line[:self.maxl] for line in self.lines]
        # ni counts only the lines read from the data file; the 52 letter
        # lines added below are not included.
        self.ni = len(self.lines)
        # NOTE(review): after this prepend, data line k sits at index k + 52
        # of self.lines (and presumably of self.char_ids). Confirm that the
        # ids produced by load_ids() account for that shift.
        self.lines = [chr(ord('a') + i) for i in range(26)
                      ] + [chr(ord('A') + i) for i in range(26)] + self.lines
        print("Add basic characters (a-z) and (A-Z) into the data")

        self.nb = self.ni - self.nq - self.nt
        self.pre_mappings = None
        if args.pre_mappings is not None:
            # SECURITY: pickle.load can execute arbitrary code; only point
            # pre_mappings at files from a trusted source.
            with open(args.pre_mappings, 'rb') as f:
                mappings = pickle.load(f)
            self.pre_mappings = mappings["char_to_id"]
            print("# Load pre calculated mappings from " + args.pre_mappings)
            print("# Loaded chars " +
                  str(''.join(mappings["char_to_id"].keys())))

        start_time = time.time()
        self.C, self.M, self.char_ids, self.alphabet = word2sig(
            self.lines, max_length=None, pre_alphabet=self.pre_mappings)
        print("# Loading time: {}".format(time.time() - start_time))
        print("# Alphabet: ", ''.join(sorted(self.alphabet)))
        # NOTE(review): train_ids / query_ids / base_ids are not assigned in
        # this method; presumably load_ids() provides them - confirm.
        self.load_ids()
        self.load_dist()

        self.xt = StringDataset(self.C, self.M,
                                [self.char_ids[i] for i in self.train_ids])
        self.xq = StringDataset(self.C, self.M,
                                [self.char_ids[i] for i in self.query_ids])
        self.xb = StringDataset(self.C, self.M,
                                [self.char_ids[i] for i in self.base_ids])