def load(max_examples=None):
    """
    Loads the movie reviews corpus as raw texts with their polarity labels.

    Every `*.txt` file found (recursively) under the `movie_reviews` data
    folder is read, in random order. A review is labelled `"neg"` when it is
    stored inside a `neg` folder and `"pos"` otherwise.

    ##### Parameters

    * `max_examples`: maximum number of reviews to load. `None` (or `0`)
      loads every review.

    ##### Returns

    A tuple `(sentences, classes)` of two parallel lists: the full text of
    each review and its label (`"neg"` or `"pos"`).

    ##### Raises

    Whatever `download` raises; a hint is printed before re-raising.
    """
    try:
        download("movie_reviews")
    except Exception:
        # Only real failures get the hint; KeyboardInterrupt / SystemExit
        # propagate without the misleading "bad connection" message.
        print(
            "Error loading data. This may be caused due to bad connection. Please delete badly downloaded data and retry"
        )
        raise

    sentences = []
    classes = []

    path = datapath("movie_reviews")
    ids = list(path.rglob("*.txt"))
    random.shuffle(ids)

    for fd in ids:
        # Look at the path components below the dataset folder rather than
        # searching the string for "neg/": the result no longer depends on
        # the OS path separator nor on where the data folder itself lives.
        cls = "neg" if "neg" in fd.relative_to(path).parts else "pos"

        with fd.open() as fp:
            sentences.append(fp.read())
        classes.append(cls)

        # Checked after appending, so a positive limit yields exactly
        # `max_examples` items (when enough files exist).
        if max_examples and len(classes) >= max_examples:
            break

    return sentences, classes
def load(representation='onehot'):
    """
    Loads the UCI cars dataset from `car.data`.

    Each non-empty line is split on commas: the first six fields become the
    features `buying`, `maint`, `doors`, `persons`, `lug_boot` and `safety`
    (kept as strings), and the seventh field is the target.

    ##### Parameters

    * `representation`: `'onehot'` (default) or `'numeric'`; selects which
      helper (`_load_onehot` or `_load_numeric`) encodes the parsed rows.

    ##### Returns

    Whatever the selected helper returns for `(X, y)`.

    ##### Raises

    `ValueError` if `representation` is neither `'numeric'` nor `'onehot'`.
    """
    download("uci_cars")

    feature_names = ("buying", "maint", "doors", "persons", "lug_boot", "safety")
    X = []
    y = []

    # `with` guarantees the file is closed even if a line fails to parse.
    with open(datapath("uci_cars") / "car.data", "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank (e.g. trailing) lines instead of crashing.
                continue
            clean_line = line.split(",")
            X.append(dict(zip(feature_names, clean_line)))
            y.append(clean_line[6])

    if representation == 'numeric':
        return _load_numeric(X, y)
    elif representation == 'onehot':
        return _load_onehot(X, y)

    raise ValueError("Invalid value for represenation: %s" % representation)
def load_raw(max_examples=None):
    """
    Reads the train and test (gold) CSV files of the
    [HAHA 2019 corpus](https://www.fing.edu.uy/inco/grupos/pln/haha/index.html#data)
    into Pandas dataframes.

    ##### Parameters

    * `max_examples`: when not `None`, each dataframe is truncated to its
      first `max_examples` rows.

    ##### Examples

    ```python
    >>> train, test = load_raw()
    >>> len(train), len(test)
    (24000, 6000)
    >>> train.columns
    Index(['id', 'text', 'is_humor', 'votes_no', 'votes_1', 'votes_2', 'votes_3',
           'votes_4', 'votes_5', 'funniness_average'],
          dtype='object')
    >>> train["funniness_average"].mean()
    2.0464498676235694

    ```
    """
    download("haha_2019")

    # Train first, then test: the order defines the returned tuple.
    frames = [
        pd.read_csv(datapath("haha_2019") / csv_name)
        for csv_name in ("haha_2019_train.csv", "haha_2019_test_gold.csv")
    ]

    if max_examples is not None:
        frames = [frame[:max_examples] for frame in frames]

    train_df, test_df = frames
    return train_df, test_df
def load():
    """
    Loads train and valid datasets from [Gisette uci dataset](https://archive.ics.uci.edu/ml/datasets/Gisette).

    Feature files are parsed as whitespace-separated integers, one example
    per line; only values greater than zero are stored in the sparse
    matrices. Labels are converted to booleans (`True` when the value is
    greater than zero).

    ##### Returns

    A tuple `(X_train, y_train, X_valid, y_valid)` where the `X` items are
    CSR sparse matrices and the `y` items are NumPy arrays.

    ##### Examples

    ```python
    >>> X_train, y_train, X_valid, y_valid = load()
    >>> X_train.shape, X_valid.shape
    ((6000, 5000), (1000, 5000))
    >>> len(y_train), len(y_valid)
    (6000, 1000)

    ```
    """
    try:
        download("gisette")
    except Exception:
        # Only real failures get the hint; KeyboardInterrupt / SystemExit
        # propagate without the misleading "bad connection" message.
        print(
            "Error loading data. This may be caused due to bad connection. Please delete badly downloaded data and retry"
        )
        raise

    # NOTE(review): the folder used to be derived from this module's own
    # location (`<module dir>/data/gisette`). It is now resolved through
    # `datapath` with the same dataset key handed to `download` -- confirm
    # that this is where `download` places the files.
    root = datapath("gisette")

    def read_features(filename, n_rows):
        # Fill a LIL matrix (cheap incremental writes), convert to CSR at the end.
        matrix = sp.lil_matrix((n_rows, 5000))
        with open(root / filename, "r") as fp:
            for i, line in enumerate(fp):
                for j, value in enumerate(line.split()):
                    value = int(value)
                    if value > 0:
                        matrix[i, j] = value
        return matrix.tocsr()

    def read_labels(filename):
        with open(root / filename, "r") as fp:
            # Blank lines are skipped instead of failing in int().
            return np.asarray([int(line) > 0 for line in fp if line.strip()])

    Xtrain = read_features("gisette_train.data", 6000)
    Xvalid = read_features("gisette_valid.data", 1000)
    ytrain = read_labels("gisette_train.labels")
    yvalid = read_labels("gisette_valid.labels")

    return Xtrain, ytrain, Xvalid, yvalid
def data_download(datasets:List[str]=typer.Argument(..., help="Name of one or more specific datasets to download, or 'all'.")):
    """
    ⏬ Download a dataset.

    Pass a name to directly download that dataset.
    Otherwise, this command will show an interactive menu.
    """
    # NOTE(review): the docstring is kept verbatim because it is presumably
    # rendered as this command's help text.

    # The literal 'all' anywhere in the arguments expands to every name
    # reported by get_datasets_list(); otherwise use the names as given.
    selected = get_datasets_list().keys() if 'all' in datasets else datasets

    for name in selected:
        download(name)
def load(representation="numeric"):
    """
    Loads corpora from [ABALONE uci dataset](https://archive.ics.uci.edu/ml/datasets/Abalone).

    Each non-empty line of `abalone.data` is split on commas: the first
    eight fields become named features (kept as strings) and the ninth
    field is the target.

    ##### Parameters

    * `representation`: `"numeric"` (default) or `"onehot"`; selects which
      helper (`_load_numeric` or `_load_onehot`) encodes the parsed rows.

    ##### Raises

    `ValueError` if `representation` is neither `"numeric"` nor `"onehot"`.

    ##### Examples

    ```python
    >>> X, y = load()
    >>> X.shape
    (4177, 6047)
    >>> len(y)
    4177

    ```
    """
    try:
        download("abalone")
    except Exception:
        # Only real failures get the hint; KeyboardInterrupt / SystemExit
        # propagate without the misleading "bad connection" message.
        print(
            "Error loading data. This may be caused due to bad connection. Please delete badly downloaded data and retry"
        )
        raise

    # Names are assigned to columns 0..7 in this order.
    # NOTE(review): the UCI attribute list gives "Whole weight" before
    # "Shucked weight", so these two labels look swapped. They are kept
    # as-is because renaming could permute the encoded columns -- confirm.
    feature_names = (
        "Sex",
        "Length",
        "Diameter",
        "Height",
        "Shucked weight",
        "Whole weight",
        "Viscera weight",
        "Shell weight",
    )

    X = []
    y = []

    # `with` guarantees the file is closed even if a line fails to parse.
    with open(datapath("abalone") / "abalone.data", "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank (e.g. trailing) lines instead of crashing.
                continue
            clean_line = line.split(",")
            X.append(dict(zip(feature_names, clean_line)))
            y.append(clean_line[8])

    if representation == "numeric":
        return _load_numeric(X, y)
    elif representation == "onehot":
        return _load_onehot(X, y)

    raise ValueError("Invalid value for represenation: %s" % representation)
def load(training_batches=5):
    """
    Load the CIFAR-10 dataset

    Training images come from the files `data_batch_1` ... `data_batch_N`
    and test images from `test_batch`. Flat pixel rows are reshaped to
    `(-1, 3, 32, 32)` and transposed so the channel axis comes last.

    ##### Parameters

    * 'training_batches': maximum number of batches to load for training,
      each batch has 10,000 examples (min=`1`, max=`5`, default=`5`).

    ##### Examples

    >>> X_train, y_train, X_test, y_test = load(training_batches=5)
    >>> X_train.shape
    (50000, 32, 32, 3)
    >>> len(y_train)
    50000
    >>> X_test.shape
    (10000, 32, 32, 3)
    >>> len(y_test)
    10000
    >>> y_train[0]
    6

    """
    download("cifar10")

    def read_batch(filename):
        # NOTE(review): unpickling runs arbitrary code from the file; this
        # assumes the downloaded archive is trusted.
        with open(datapath('cifar10') / filename, 'rb') as fp:
            return pickle.load(fp, encoding='bytes')

    def as_images(flat):
        # (N, 3072) -> (N, 3, 32, 32) -> (N, 32, 32, 3)
        return np.reshape(flat, (-1, 3, 32, 32)).transpose(0, 2, 3, 1)

    train_chunks = []
    y_train = []
    for index in range(1, training_batches + 1):
        batch = read_batch(f'data_batch_{index}')
        train_chunks.append(batch[b'data'])
        y_train.extend(batch[b'labels'])

    X_train = as_images(np.vstack(train_chunks))

    test = read_batch('test_batch')
    test_pixels, y_test = test[b'data'], test[b'labels']
    X_test = as_images(test_pixels)

    return X_train, y_train, X_test, y_test
def load():
    """
    Loads corpora from [Yeast uci dataset](https://archive.ics.uci.edu/ml/datasets/Yeast).

    Each non-empty line of `yeast.data` is split on whitespace. Field 0 is
    ignored, fields 1 to 8 are parsed as floats and stored under the keys
    `"1"` ... `"8"`, and field 9 is the target. The parsed rows are then
    encoded by `_load_onehot`.

    ##### Examples

    ```python
    >>> X, y = load()
    >>> X.shape
    (1484, 8)
    >>> len(y)
    1484

    ```
    """
    try:
        download("yeast")
    except Exception:
        # Only real failures get the hint; KeyboardInterrupt / SystemExit
        # propagate without the misleading "bad connection" message.
        print(
            "Error loading data. This may be caused due to bad connection. Please delete badly downloaded data and retry"
        )
        raise

    X = []
    y = []

    # NOTE(review): the folder used to be derived from this module's own
    # location (`<module dir>/data/yeast`). It is now resolved through
    # `datapath` with the same dataset key handed to `download` -- confirm
    # that this is where `download` places the files.
    with open(datapath("yeast") / "yeast.data", "r") as f:
        for line in f:
            clean_line = line.split()
            if not clean_line:
                # Tolerate blank (e.g. trailing) lines instead of crashing.
                continue
            X.append({str(k): float(clean_line[k]) for k in range(1, 9)})
            y.append(clean_line[9])

    return _load_onehot(X, y)
def load(white=True, red=True, max_examples=None):
    """
    Loads the wine quality dataset from its `;`-separated CSV files.

    The first line of each file is treated as a header and skipped. Every
    other non-empty line is parsed as floats: the last value is the target
    and the remaining ones are the features, prefixed with a two-column
    colour indicator (`[1, 0]` for white, `[0, 1]` for red). White wines
    are read first, then red wines.

    ##### Parameters

    * `white`: include `winequality-white.csv`.
    * `red`: include `winequality-red.csv`.
    * `max_examples`: maximum total number of rows to load. `None` (or
      `0`) loads everything.

    ##### Returns

    A tuple `(X, y)` of NumPy arrays.

    ##### Raises

    `ValueError` if both `white` and `red` are false.
    """
    if not red and not white:
        raise ValueError("Either red or white must be selected")

    download("wine_quality")

    # (selected?, file name, colour indicator columns), in reading order.
    sources = (
        (white, "winequality-white.csv", [1, 0]),
        (red, "winequality-red.csv", [0, 1]),
    )

    X = []
    y = []

    for selected, filename, colour in sources:
        if not selected:
            # Unselected files are never opened, so they need not exist.
            continue

        # `with` guarantees the file is closed even if a line fails to parse.
        with open(datapath("wine_quality") / filename, "r") as fp:
            next(fp, None)  # skip the header row
            for line in fp:
                if max_examples and len(X) >= max_examples:
                    break
                line = line.strip()
                if not line:
                    # Tolerate blank (e.g. trailing) lines instead of crashing.
                    continue
                values = [float(field) for field in line.split(";")]
                X.append(colour + values[:-1])
                y.append(values[-1])

    return np.asarray(X), np.asarray(y)
def load():
    """
    Loads train and valid datasets from [DOROTHEA uci dataset](https://archive.ics.uci.edu/ml/datasets/dorothea).

    Each line of a `.data` file lists the 1-based indices of the columns
    that are set to `1` for that example; everything else stays `0`. Each
    non-empty line of a `.labels` file is parsed as an integer.

    ##### Returns

    A tuple `(X_train, y_train, X_valid, y_valid)` where the `X` items are
    CSR sparse integer matrices and the `y` items are NumPy arrays.

    ##### Examples

    ```python
    >>> X_train, y_train, X_valid, y_valid = load()
    >>> X_train.shape, X_valid.shape
    ((800, 100000), (350, 100000))
    >>> len(y_train), len(y_valid)
    (800, 350)

    ```
    """
    download("dorothea")

    def read_features(filename, n_rows):
        # Fill a LIL matrix (cheap incremental writes), convert to CSR at the end.
        matrix = sp.lil_matrix((n_rows, 100000), dtype=int)
        with open(datapath('dorothea') / filename, "r") as fp:
            for row, line in enumerate(fp):
                for col in line.split():
                    # File indices are 1-based; matrix columns are 0-based.
                    matrix[row, int(col) - 1] = 1
        return matrix.tocsr()

    def read_labels(filename):
        with open(datapath('dorothea') / filename, "r") as fp:
            # Blank lines are skipped instead of failing in int().
            return np.asarray([int(line) for line in fp if line.strip()])

    Xtrain = read_features("dorothea_train.data", 800)
    Xvalid = read_features("dorothea_valid.data", 350)
    ytrain = read_labels("dorothea_train.labels")
    yvalid = read_labels("dorothea_valid.labels")

    return Xtrain, ytrain, Xvalid, yvalid
def load(max_examples=None):
    """
    Loads the German credit dataset from `german.data`.

    Each non-empty line is split on whitespace. Every field but the last is
    passed through `_parse` and stored under `feature_<position>`; the last
    field is the target, encoded as `True` when it equals `2`.

    ##### Parameters

    * `max_examples`: maximum number of rows to load. `None` (or `0`)
      loads everything.

    ##### Returns

    A tuple `(X, y)`: the dense matrix produced by
    `DictVectorizer(sparse=False)` and a NumPy boolean array.
    """
    download("german_credit")

    X = []
    y = []

    # `with` guarantees the file is closed even if a line fails to parse.
    with open(datapath("german_credit") / "german.data", "r") as f:
        for raw_line in f:
            if max_examples and len(X) >= max_examples:
                break

            clean_line = raw_line.split()
            if not clean_line:
                # Tolerate blank (e.g. trailing) lines instead of crashing.
                continue

            features = {
                'feature_%i' % index: _parse(value)
                for index, value in enumerate(clean_line[:-1])
            }
            X.append(features)
            y.append(int(clean_line[-1]) == 2)

    return DictVectorizer(sparse=False).fit_transform(X), np.asarray(y)
def load(max_examples=None):
    """
    Loads train and test datasets from [MEDDOCAN iberleaf 2018](https://github.com/PlanTL-SANIDAD/SPACCC_MEDDOCAN).

    Every `.ann` file in the `train/brat`, `dev/brat` and `test/brat`
    folders is parsed with `parse_text_and_tags` and tokenized with
    `get_tagged_tokens`; a file contributes only when `compare_tags`
    accepts its result. The `dev` split is merged into the training data.

    ##### Parameters

    * `max_examples`: when not `None`, each returned list is truncated to
      its first `max_examples` items.

    ##### Returns

    A tuple `(X_train, y_train, X_test, y_test)` of lists.

    ##### Examples

    ```python
    >>> X_train, y_train, X_valid, y_valid = load()
    >>> len(X_train), len(X_valid)
    (25622, 8432)
    >>> len(y_train), len(y_valid)
    (25622, 8432)

    ```
    """
    try:
        download("meddocan_2018")
    except Exception:
        # Only real failures get the hint; KeyboardInterrupt / SystemExit
        # propagate without the misleading "bad connection" message.
        print(
            "Error loading data. This may be caused due to bad connection. Please delete badly downloaded data and retry"
        )
        raise

    def read_split(folder, tokens, tags):
        # Append the tokens/tags of every accepted `.ann` file in `folder`.
        with os.scandir(folder) as entries:
            for entry in entries:
                # `endswith` cannot fail on names without a dot, unlike
                # indexing the result of `name.split(".")`.
                if not entry.name.endswith(".ann"):
                    continue
                text, phi = parse_text_and_tags(entry.path)
                brat_corpora, text, ibo_corpora = get_tagged_tokens(text, phi)
                if compare_tags(brat_corpora, phi):
                    tokens.extend(text)
                    tags.extend(ibo_corpora)

    root = datapath("meddocan_2018")

    X_train = []
    y_train = []
    X_test = []
    y_test = []

    # Train and dev both feed the training lists, in that order.
    read_split(root / "train/brat", X_train, y_train)
    read_split(root / "dev/brat", X_train, y_train)
    read_split(root / "test/brat", X_test, y_test)

    if max_examples is not None:
        X_train = X_train[:max_examples]
        X_test = X_test[:max_examples]
        y_train = y_train[:max_examples]
        y_test = y_test[:max_examples]

    return X_train, y_train, X_test, y_test
def load(max_examples=None):
    """
    Loads train and valid datasets from [Shuttle uci dataset](https://archive.ics.uci.edu/ml/datasets/Statlog+(Shuttle)).

    Each non-empty line of `shuttle.trn` / `shuttle.tst` is split on
    whitespace: the first nine fields are parsed as integers and stored
    under the keys `"1"` ... `"9"`, and the tenth field is the target.
    Each split is then encoded by its own call to `_load_onehot`.

    ##### Parameters

    * `max_examples`: maximum number of rows to load from each file.
      `None` (or `0`) loads everything.

    ##### Examples

    ```python
    >>> X_train, y_train, X_valid, y_valid = load()
    >>> X_train.shape, X_valid.shape
    ((43500, 9), (14500, 9))
    >>> len(y_train), len(y_valid)
    (43500, 14500)

    ```
    """
    try:
        download("shuttle")
    except Exception:
        # Only real failures get the hint; KeyboardInterrupt / SystemExit
        # propagate without the misleading "bad connection" message.
        print(
            "Error loading data. This may be caused due to bad connection. Please delete badly downloaded data and retry"
        )
        raise

    def read_split(filename):
        # Parse one data file into (list of feature dicts, list of labels).
        rows = []
        labels = []
        # `with` guarantees the file is closed even if a line fails to parse.
        with open(datapath("shuttle") / filename, "r") as fp:
            for line in fp:
                clean_line = line.split()
                if not clean_line:
                    # Tolerate blank (e.g. trailing) lines instead of crashing.
                    continue
                rows.append({str(k + 1): int(clean_line[k]) for k in range(9)})
                labels.append(clean_line[9])
                # Checked after appending, so a positive limit yields
                # exactly `max_examples` rows (when enough lines exist).
                if max_examples and len(rows) >= max_examples:
                    break
        return rows, labels

    X_train, y_train = read_split("shuttle.trn")
    X_test, y_test = read_split("shuttle.tst")

    # NOTE(review): the two splits are encoded independently; if
    # `_load_onehot` fits its encoding on the data it receives, train and
    # test encodings may not line up -- confirm against the helper.
    X_train, y_train = _load_onehot(X_train, y_train)
    X_test, y_test = _load_onehot(X_test, y_test)

    return X_train, y_train, X_test, y_test