def train(power=6.46):
    """Build 10-fold cross-validation splits plus per-class word-probability tables.

    Regenerates the pickled dataset, shuffles it, then for each of 10 folds
    collects the training sentences into per-label word buckets and converts
    each bucket into a dict of ``word -> relative frequency`` augmented with
    two special keys:

    - ``"null"``  : ``class_word_count * 2 ** power`` — smoothing-related
      constant for unseen words (NOTE(review): an unusually large value for a
      "probability"; presumably the consumer divides by it — confirm against
      the predictor).
    - ``"prior"`` : class word count / total word count (class prior weight).

    Parameters
    ----------
    power : float, default 6.46
        Exponent controlling the ``"null"`` smoothing constant.

    Returns
    -------
    (_test_sets, _prob_lists)
        ``_test_sets[i]`` is the ``(x_test, y_test)`` pair of fold ``i``;
        ``_prob_lists[i]`` is the list of per-class probability dicts fitted
        on the remaining nine folds.
    """
    dat_path = os.path.join("_Data", "dataset.dat")
    gen_dataset(dat_path)
    with open(dat_path, "rb") as _file:
        # x: list of tokenized sentences (lists of words); y: integer labels.
        x, y = pickle.load(_file)
    # Shuffle x and y with the same permutation so pairs stay aligned.
    _indices = np.random.permutation(len(x))
    x = [x[i] for i in _indices]
    y = [y[i] for i in _indices]
    data_len = len(x)
    batch_size = math.ceil(data_len * 0.1)
    _test_sets, _prob_lists = [], []
    # Total word count over the whole corpus (generator avoids a throwaway list).
    _total = sum(len(sentence) for sentence in x)
    for i in range(10):
        # One word bucket per label.
        # NOTE(review): hard-coded 9 assumes exactly 9 label classes — confirm.
        rs = [[] for _ in range(9)]
        # Last fold absorbs the remainder so every sample is covered.
        _next = (i + 1) * batch_size if i != 9 else data_len
        x_train = x[:i * batch_size] + x[(i + 1) * batch_size:]
        y_train = y[:i * batch_size] + y[(i + 1) * batch_size:]
        x_test, y_test = x[i * batch_size:_next], y[i * batch_size:_next]
        for xx, yy in zip(x_train, y_train):
            rs[yy] += xx  # concatenate the sentence's words into its class bucket
        _counters = [Counter(group) for group in rs]
        _test_sets.append((x_test, y_test))
        _prob_lst = []
        for counter in _counters:
            _sum = sum(counter.values())
            # Relative frequency of each word within this class.
            probs = {key: value / _sum for key, value in counter.items()}
            probs["null"] = _sum * 2 ** power
            probs["prior"] = _sum / _total
            _prob_lst.append(probs)
        _prob_lists.append(_prob_lst)
    return _test_sets, _prob_lists
def main(clf):
    """Run 10-fold cross-validation of *clf* on TF-IDF features of the dataset.

    Regenerates the pickled dataset, joins each tokenized sentence into a
    single string, shuffles, and for each fold fits a fresh
    ``CountVectorizer`` + ``TfidfTransformer`` on the training split only
    (no leakage from the test split), trains *clf*, and records accuracy.

    Parameters
    ----------
    clf : object
        Project classifier exposing ``fit``, ``predict`` and ``acc`` —
        interface assumed from usage; confirm against the class definition.

    Returns
    -------
    (_acc_lst, y_results)
        Per-fold accuracies and per-fold ``[y_test, y_pred]`` pairs.
    """
    dat_path = os.path.join("_Data", "dataset.dat")
    gen_dataset(dat_path)
    with open(dat_path, "rb") as _file:
        x, y = pickle.load(_file)
    # Vectorizers expect raw documents, so rejoin token lists into strings.
    x = [" ".join(sentence) for sentence in x]
    # Shuffle x and y with one shared permutation to keep pairs aligned.
    _indices = np.random.permutation(len(x))
    x = list(np.array(x)[_indices])
    y = list(np.array(y)[_indices])
    data_len = len(x)
    batch_size = math.ceil(data_len * 0.1)
    _acc_lst, y_results = [], []
    bar = ProgressBar(max_value=10, name=str(clf))
    for i in range(10):
        # Last fold absorbs the remainder so every sample is covered.
        _next = (i + 1) * batch_size if i != 9 else data_len
        x_train = x[:i * batch_size] + x[(i + 1) * batch_size:]
        y_train = y[:i * batch_size] + y[(i + 1) * batch_size:]
        x_test, y_test = x[i * batch_size:_next], y[i * batch_size:_next]
        count_vec = CountVectorizer()
        counts_train = count_vec.fit_transform(x_train)
        counts_test = count_vec.transform(x_test)
        tfidf_transformer = TfidfTransformer()
        x_train = tfidf_transformer.fit_transform(counts_train)
        # BUG FIX: the original predicted on raw counts while training on
        # TF-IDF features; the test split must pass through the same fitted
        # TF-IDF transform so train and test live in the same feature space.
        x_test = tfidf_transformer.transform(counts_test)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        _acc_lst.append(clf.acc(y_test, y_pred))
        y_results.append([y_test, y_pred])
        # Release the per-fold matrices before the next allocation.
        del x_train, y_train, x_test, y_test, y_pred
        bar.update()
    return _acc_lst, y_results
def train(power=6.46):
    """Prepare 10-fold CV test splits and per-class word-frequency tables.

    Rebuilds the pickled dataset, shuffles it, and for every fold gathers
    the training words into one bucket per label. Each bucket becomes a
    dict mapping word -> in-class relative frequency, plus two extra keys:
    ``"null"`` (``class_total * 2 ** power``, a smoothing constant for
    unseen words) and ``"prior"`` (class word share of the whole corpus).

    Returns a ``(test_sets, prob_lists)`` pair indexed by fold.
    """
    dat_path = os.path.join("_Data", "dataset.dat")
    gen_dataset(dat_path)
    with open(dat_path, "rb") as f:
        x, y = pickle.load(f)
    # One shared permutation keeps sentences and labels aligned.
    perm = np.random.permutation(len(x))
    x = [x[j] for j in perm]
    y = [y[j] for j in perm]
    n = len(x)
    fold = math.ceil(n * 0.1)
    test_sets, prob_lists = [], []
    word_total = sum([len(sentence) for sentence in x])
    for k in range(10):
        lo = k * fold
        hi = (k + 1) * fold if k != 9 else n  # last fold takes the remainder
        buckets = [[] for _ in range(9)]      # one word list per label
        train_x = x[:lo] + x[hi:]
        train_y = y[:lo] + y[hi:]
        for words, label in zip(train_x, train_y):
            buckets[label] += words
        test_sets.append((x[lo:hi], y[lo:hi]))
        fold_probs = []
        for counter in (Counter(bucket) for bucket in buckets):
            class_total = sum(counter.values())
            table = {word: count / class_total for word, count in counter.items()}
            table["null"] = class_total * 2 ** power
            table["prior"] = class_total / word_total
            fold_probs.append(table)
        prob_lists.append(fold_probs)
    return test_sets, prob_lists
def main(clf):
    """Run 10-fold cross-validation of *clf* on TF-IDF features of the dataset.

    Regenerates the pickled dataset, joins each tokenized sentence into a
    single string, shuffles, and for each fold fits a fresh
    ``CountVectorizer`` + ``TfidfTransformer`` on the training split only,
    trains *clf*, and records accuracy.

    Parameters
    ----------
    clf : object
        Project classifier exposing ``fit``, ``predict`` and ``acc`` —
        interface assumed from usage; confirm against the class definition.

    Returns
    -------
    (acc_lst, y_results)
        Per-fold accuracies and per-fold ``[y_test, y_pred]`` pairs.
    """
    dat_path = os.path.join("_Data", "dataset.dat")
    gen_dataset(dat_path)
    with open(dat_path, "rb") as _file:
        x, y = pickle.load(_file)
    # Vectorizers expect raw documents, so rejoin token lists into strings.
    x = [" ".join(sentence) for sentence in x]
    # Shuffle x and y with one shared permutation to keep pairs aligned.
    _indices = np.random.permutation(len(x))
    x = list(np.array(x)[_indices])
    y = list(np.array(y)[_indices])
    data_len = len(x)
    batch_size = math.ceil(data_len * 0.1)
    acc_lst, y_results = [], []
    bar = ProgressBar(max_value=10, name=str(clf))
    for i in range(10):
        # Last fold absorbs the remainder so every sample is covered.
        _next = (i + 1) * batch_size if i != 9 else data_len
        x_train = x[:i * batch_size] + x[(i + 1) * batch_size:]
        y_train = y[:i * batch_size] + y[(i + 1) * batch_size:]
        x_test, y_test = x[i * batch_size:_next], y[i * batch_size:_next]
        count_vec = CountVectorizer()
        counts_train = count_vec.fit_transform(x_train)
        counts_test = count_vec.transform(x_test)
        tfidf_transformer = TfidfTransformer()
        x_train = tfidf_transformer.fit_transform(counts_train)
        # BUG FIX: the original predicted on raw counts while training on
        # TF-IDF features; the test split must pass through the same fitted
        # TF-IDF transform so train and test live in the same feature space.
        x_test = tfidf_transformer.transform(counts_test)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        acc_lst.append(clf.acc(y_test, y_pred))
        y_results.append([y_test, y_pred])
        # Release the per-fold matrices before the next allocation.
        del x_train, y_train, x_test, y_test, y_pred
        bar.update()
    return acc_lst, y_results