def main():
    """Train a MultinomialNB text classifier and report its test-set accuracy.

    Steps, in order:
      1. Load the training texts/labels and the test texts/labels from ``data/``.
      2. Optionally (module-level ``compare_datasets`` flag) list the indices at
         which our test labels differ from the alternate label file.
      3. Build bag-of-words features with ``CountVectorizer`` + ``LemmaTokenizer``.
      4. Fit ``MultinomialNB`` and predict the test set.
      5. Print the accuracy, the elapsed time and a classification report.

    Returns nothing; all results are printed.
    """
    t0 = time.time()

    print("Fetching training and testing datasets..")
    tr = load_txt("data/train_set.txt")
    tr_x = tr[0]
    tr_y = tr[1]
    ts_x_raw = load_xlsx("data/test_set.xlsx")
    ts_x = [row["A"] for row in ts_x_raw]
    ts_y_raw = load_txt("data/test_set_y.txt")
    # Drop the last entry because there's a new line at the end of the file.
    ts_y = ts_y_raw[1][:-1]

    if compare_datasets:
        # Check our test labels against Eysteinn's
        ts_y_alternate = load_csv("data/test_dataset.csv")
        # Compare by value (`!=`), not identity (`is not`): equal labels read
        # from two different files are generally distinct objects, so an
        # identity check would flag entries that are actually the same.
        different = [
            i for i, alternate_label in enumerate(ts_y_alternate)
            if ts_y[i] != alternate_label
        ]
        print("Number of different entries:")
        print(len(different))
        print(different)

    print("Creating features from training set..")
    vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), lowercase=True)
    tr_vectors = vectorizer.fit_transform(tr_x)

    print("Creating MultinomialNB classifier..")
    clf = MultinomialNB()
    clf.fit(tr_vectors, tr_y)
    # Reuse the vocabulary learned on the training set for the test set.
    ts_x_featurized = vectorizer.transform(ts_x)

    print("Making predictions..")
    predictions = clf.predict(ts_x_featurized)

    # Timing covers loading, training and prediction, but not the scoring below.
    t1 = time.time()
    dt = t1 - t0

    # Indexing ts_y (rather than zipping) keeps a length mismatch loud.
    correct_predictions = sum(
        1 for i, predicted in enumerate(predictions) if predicted == ts_y[i]
    )

    print("Result: %d/%d correct predictions (%.2f%%), in %.2fs.\n"
          % (correct_predictions, len(predictions),
             100. * correct_predictions / len(predictions), dt))
    print(classification_report(ts_y, predictions))
def _inner_load_stopwords():
    """Return the de-duplicated union of all stop-word lists as a list.

    Two sources are merged for French, German, English and Italian: the
    ``stop_words`` python package (via ``get_stop_words``) and the "smart"
    stop-word text files under ``stop_words/`` (via ``load_txt``).
    """
    package_languages = ('fr', 'de', 'en', 'it')
    graham_files = (
        'stop_words/smart_stop_words_fr.txt',
        'stop_words/smart_stop_words_de.txt',
        'stop_words/smart_stop_words_en.txt',
        'stop_words/smart_stop_words_it.txt',
    )

    merged = set()

    # from stop_words python package
    for language in package_languages:
        merged.update(get_stop_words(language))

    # from Peter Graham on GitHub
    for path in graham_files:
        merged.update(load_txt(path))

    return list(merged)
def __init__(self, ids_file, x_dir, y_dir, get_label=True, x_suffix='.jpg', y_suffix='.png', transforms=None, seed=None, verbose=False):
    """
    Record the sample ids and the directory/suffix configuration of the dataset.

    Args:
        - ids_file: path to a text file containing (only) samples' ids, one id
          per line (e.g. the three lines '0000', '0201', '0299'); it is read
          with `load_txt`
        - x_dir (str or Path): path to the directory containing data images (x)
        - y_dir (str or Path): path to the directory containing label images (y)
        - get_label (bool): if True, returns `y` (as np.array of shape (h,w,1))
          in addition to `x`; this constructor only stores the flag
        - x_suffix (str): stored as-is; presumably the file extension of the
          data images -- confirm against the code that builds file paths
        - y_suffix (str): stored as-is; presumably the file extension of the
          label images -- confirm against the code that builds file paths
        - transforms (tuple or None): a tuple of transforms to apply to `x`
          and `y`, respectively
        - seed: stored as-is on the instance; not used in this constructor
        - verbose (bool): if True, print a short summary of the created dataset
    """
    # Read data line by line
    self.ids = load_txt(ids_file)

    # Normalise the directories to Path objects; existing Paths are kept as-is.
    self.x_dir = x_dir if isinstance(x_dir, Path) else Path(x_dir)
    self.y_dir = y_dir if isinstance(y_dir, Path) else Path(y_dir)
    self.x_suffix, self.y_suffix = x_suffix, y_suffix

    self.get_label = get_label
    self.transforms = transforms
    self.seed = seed

    if verbose:
        num_ids = len(self.ids)
        if num_ids > 0:
            print("Created dataset: ", num_ids, '[', self.ids[0], self.ids[-1], ']')
        else:
            # An empty ids file has no first/last id to show; indexing it
            # would raise IndexError just for asking for a summary.
            print("Created dataset: ", num_ids, '[', ']')
        print('\txdir: ', self.x_dir)
        print('\tydir: ', self.y_dir)
        print('\tnumber of ids: ', num_ids)
def load(self, filename):
    """Replace ``self.filter_list`` with the result of ``load_txt(filename)``."""
    loaded_entries = load_txt(filename)
    self.filter_list = loaded_entries
def main():
    """Grid-search SVM hyper-parameters and report the best test-set result.

    Steps, in order:
      1. Load the training texts/labels and the test texts/labels from ``data/``.
      2. Optionally (module-level ``compare_datasets`` flag) list the indices at
         which our test labels differ from the alternate label file.
      3. Build bag-of-words features with ``CountVectorizer`` + ``LemmaTokenizer``.
      4. Fit one ``svm.SVC`` per (kernel, C, degree, coef0) combination and keep
         the predictions of the first combination with the most correct labels.
      5. Print the best accuracy, the elapsed time and a classification report.

    Returns nothing; all results are printed.
    """
    # Function-scope import so the block does not depend on the file's imports.
    from itertools import product

    t0 = time.time()

    print("Fetching training and testing datasets..")
    tr = load_txt("data/train_set.txt")
    tr_x = tr[0]
    tr_y = tr[1]
    ts_x_raw = load_xlsx("data/test_set.xlsx")
    ts_x = [row["A"] for row in ts_x_raw]
    ts_y_raw = load_txt("data/test_set_y.txt")
    # Drop the last entry because there's a new line at the end of the file.
    ts_y = ts_y_raw[1][:-1]

    if compare_datasets:
        # Check our test labels against Eysteinn's
        ts_y_alternate = load_csv("data/test_dataset.csv")
        # Compare by value (`!=`), not identity (`is not`): equal labels read
        # from two different files are generally distinct objects, so an
        # identity check would flag entries that are actually the same.
        different = [
            i for i, alternate_label in enumerate(ts_y_alternate)
            if ts_y[i] != alternate_label
        ]
        print("Number of different entries:")
        print(len(different))
        print(different)

    print("Creating features from training set..")
    vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), lowercase=True)
    tr_vectors = vectorizer.fit_transform(tr_x)

    print("Grid searching params for SVM classifier..")
    # NOTE: for a quick run, a single-point grid such as
    # kernel='poly', C=10, degree=3, coef0=5 can be substituted here.
    params = {
        'kernel': ('linear', 'poly', 'rbf'),
        'C': [1, 10],
        'degree': [2, 3, 4, 5],
        'coef0': [5, 7, 10, 15, 17, 20],
    }

    # The test features do not depend on the SVM hyper-parameters, so they are
    # computed once instead of once per grid point.
    ts_x_featurized = vectorizer.transform(ts_x)

    best_res = 0
    best_predictions = None  # stays None if no grid point gets anything right
    t1 = t0

    # product() visits the combinations in the same order as nested loops over
    # kernel, then C, then degree, then coef0.
    grid = product(params['kernel'], params['C'], params['degree'], params['coef0'])
    for kernel, c, d, coef in grid:
        clf = svm.SVC(kernel=kernel, C=c, degree=d, coef0=coef)
        clf.fit(tr_vectors, tr_y)
        predictions = clf.predict(ts_x_featurized)
        t1 = time.time()

        # Indexing ts_y (rather than zipping) keeps a length mismatch loud.
        correct_predictions = sum(
            1 for i, predicted in enumerate(predictions) if predicted == ts_y[i]
        )

        # Strict '>' keeps the first combination that reaches a given score.
        if correct_predictions > best_res:
            best_res = correct_predictions
            best_predictions = predictions
            print('kernel:', kernel, 'C:', c, 'degree:', d, 'coef0:', coef)
            print('Numcorrect', best_res)

    dt = t1 - t0

    if best_predictions is None:
        # Nothing to summarise; calling len() on a placeholder would raise.
        print("No parameter combination produced a correct prediction.")
        return

    print("Result: %d/%d correct predictions (%.2f%%), in %.2fs.\n"
          % (best_res, len(best_predictions),
             100. * best_res / len(best_predictions), dt))
    print(classification_report(ts_y, best_predictions))