def write_csv(self, path=None, mode='a'):
    """
    Write the collected books information to a given CSV file

    Append if the file already exists

    Parameters
    ----------
    path : str (default is the category name)
        The path including the file name (without the .csv extension)
    mode : str (default is 'a')
        The file mode used to open the file (r, r+, w, w+, a, a+, x, x+)
    """
    if self.books == []:
        self.collect()
    if path is None:
        path = self.name.lower().replace(' ', '_')
    fields = self.books[0].get_headers()
    headers = {fields[i]: fields[i] for i in range(len(fields))}
    if not os.path.exists(f'{path}.csv') or (mode != 'a' and mode != 'a+'):
        FileIO.write(path, fields, headers, mode)
    for book in self.books:
        FileIO.write(path, fields, book.to_dict(), 'a')
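# write_csv() above delegates the actual CSV I/O to FileIO.write(path, fields,
# row, mode). That helper is not shown in this section; the sketch below is a
# minimal assumption about its behaviour (one dict written per call, column
# order taken from `fields`), not the project's actual implementation.
import csv

def write(path, fields, row, mode='a'):
    """Write a single dict as one row of `{path}.csv` (as FileIO.write would)."""
    with open(f'{path}.csv', mode, newline='', encoding='utf-8') as f:
        csv.DictWriter(f, fieldnames=fields).writerow(row)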
def __scrap_books(self):
    books = []
    book = Book(self.links[0][0]).collect()
    FileIO.open_category(self.name)
    for link in self.links:
        progress_monitor.catbooks_update(
            len(books), self.num_books, link[1])
        book = Book(link[0])
        book.collect()
        books.append(book)
        if self.dl_image:
            book.save_image()
        progress_monitor.catbooks_update(
            len(books), self.num_books, link[1])
    FileIO.close_category()
    return books
def test_open_category(self):
    catname = 'testcat'
    FileIO.open_category(catname)
    assert getcwd() == urljoin(self.cwd + '/', catname)
    chdir('..')
    assert path.exists(catname) is True
    rmdir(catname)
def test_init_root(self):
    dirname = 'testinit'
    assert getcwd() == self.cwd
    FileIO.init_root(dirname, False)
    assert getcwd() == urljoin(self.cwd + '/', dirname)
    chdir('..')
    assert path.exists(dirname) is True
    rmdir(dirname)
def test_close_category(self):
    dirname = "testclose"
    assert getcwd() == self.cwd
    mkdir(dirname)
    chdir(dirname)
    FileIO.close_category()
    assert getcwd() == self.cwd
    rmdir(dirname)
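# The three tests above pin down what the directory helpers are expected to
# do: init_root()/open_category() create a folder and chdir into it, and
# close_category() steps back up. A minimal sketch consistent with those
# assertions (an assumption, not the project's actual code; the `erase` flag
# is ignored here):
import os

def init_root(dirname, erase=False):
    """Create (or reuse) the root data folder and move into it."""
    os.makedirs(dirname, exist_ok=True)
    os.chdir(dirname)

def open_category(name):
    """Create a per-category sub-folder and move into it."""
    os.makedirs(name, exist_ok=True)
    os.chdir(name)

def close_category():
    """Leave the current category folder."""
    os.chdir('..')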
def scan(self, ctx, prev_num):
    self.compute_stats()
    #
    # Check if we have encountered this file during this scan already
    #
    ctx.num_visited_files_reporter.increment(1)
    ctx.current_scanned_file_reporter.set(self.path())

    if self.scan_hlink(ctx):
        logging.info("File %s: HLINK" % self.path())
        return

    #
    # Check if the file is the same as in one of the upper levels
    #
    if self.scan_prev(ctx, prev_num):
        logging.debug("File %s: PREV" % self.path())
        ctx.num_prev_files_reporter.increment(1)
        return

    # --- File not yet in database, process it
    file_size = 0
    packer = PackerStream.PackerOStream(self.backup, Container.CODE_DATA)
    handle = open(self.path(), "rb")
    for data in FileIO.read_blocks(handle, self.backup.get_block_size()):
        packer.write(data)
        file_size += len(data)
        ctx.num_total_blocks_reporter.increment(1)
        ctx.size_total_blocks_reporter.increment(len(data))
        ctx.update_scan_status()
    handle.close()

    self.digest = packer.get_digest()
    self.level = packer.get_level()
    self.update_hlink(ctx)

    logging.info("Scanned file %s size:%d new_blocks:%d new_blocks_size:%d" %
                 (self.path(), file_size, packer.get_num_new_blocks(),
                  packer.get_size_new_blocks()))

    ctx.num_scanned_files_reporter.increment(1)
    if packer.get_num_new_blocks() != 0:
        ctx.num_new_blocks_reporter.increment(packer.get_num_new_blocks())
        ctx.size_new_blocks_reporter.increment(packer.get_size_new_blocks())
        ctx.num_changed_files_reporter.increment(1)
        ctx.changed_files_reporter.append(self.path())

    if file_size > 256 * 1024:
        logging.debug("File %s is big enough to register in cndb" % self.path())
        cndb = self.backup.get_completed_nodes_db()
        assert self.stats is not None
        path_digest = Digest.dataDigest(self.path().encode('utf8'))
        encoded = (self.digest +
                   IntegerEncodings.binary_encode_int_varlen(self.level) +
                   IntegerEncodings.binary_encode_int_varlen(self.get_type()) +
                   serialize_stats(self.get_stats()))

        if not cndb.has_key(path_digest) or cndb[path_digest] != encoded:
            cndb[path_digest] = encoded
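# scan() above, and restore()/retrieve()/test() below, all iterate over
# FileIO.read_blocks(stream, block_size). A plausible minimal implementation
# (assumed here, not taken from the backup tool itself) is a generator that
# yields fixed-size chunks until the stream is exhausted:
def read_blocks(handle, block_size):
    """Yield successive chunks of at most `block_size` bytes from `handle`."""
    while True:
        data = handle.read(block_size)
        if not data:
            break
        yield data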
def restore(self, ctx):
    """
    Recreate the data from the information stored in the backup
    """
    logging.info("Restoring " + self.path())
    #
    # Check if the file has already been processed
    # during this pass
    #
    if self.restore_hlink(ctx):
        return

    #
    # No, this file is new. Create it.
    #
    packer = PackerStream.PackerIStream(self.backup, self.digest,
                                        self.level)
    file = open(self.path(), "wb")
    for data in FileIO.read_blocks(packer, Digest.dataDigestSize()):
        # print "File", self.path(), "reading digest",
        #       base64.b64encode(digest)
        file.write(data)
    file.close()
    self.restore_stats()
def collect(self):
    """
    Connect to the home page and grab the information
    """
    self._soup = FileIO.connect_with_bs4(self.site_url)
    self.num_books = self.__scrap_num_books()
    self.links = self.__scrap_links()
    self.categories = self.__scrap_categories()
def collect(self):
    """
    Connect to the category page and grab the information
    """
    self._soup = FileIO.connect_with_bs4(self.category_url)
    self.name = self.__scrap_name()
    self.num_books = self.__scrap_num_books()
    self.links = self.__scrap_links()
    self.books = self.__scrap_books()
def retrieve(self, stream):
    """
    Recreate the data from the information stored in the backup
    into the given stream
    """
    logging.info("Retrieving file " + self.path())
    packer = PackerStream.PackerIStream(self.backup, self.digest,
                                        self.level)
    for data in FileIO.read_blocks(packer, Digest.dataDigestSize()):
        stream.write(data)
def test(self, ctx):
    """
    Test that loading the data from the storages is successful
    """
    logging.info("Testing " + self.path())
    packer = PackerStream.PackerIStream(self.backup, self.digest,
                                        self.level)
    for data in FileIO.read_blocks(packer, Digest.dataDigestSize()):
        # Do nothing with the data, just make sure it got loaded
        pass
def readfile():
    logger.info("Hitting URL %s", request.url)
    rawContent = None
    filePath = request.args.get("file_path")
    error, response = FileIO.readFile(filePath)
    if error:
        response = None
        logger.debug("Error: %s", error)
    logger.debug("Response: %s", response)
    return jsonify(response=response, error=error)
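# The endpoint above expects FileIO.readFile(filePath) to return an
# (error, response) pair rather than raise. A hedged sketch of that contract
# (the real helper and its error strings are not shown in this section):
def readFile(file_path):
    """Return (error, contents); exactly one of the two is None."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return None, f.read()
    except OSError as exc:
        return str(exc), None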
def collect(self):
    """
    Connect to the product page and grab the information
    """
    self._soup = FileIO.connect_with_bs4(self.product_page_url)
    self.universal_product_code = self.__scrap_upc()
    self.title = self.__scrap_title()
    self.price_including_tax = self.__scrap_price_inc_tax()
    self.price_excluding_tax = self.__scrap_price_exc_tax()
    self.number_available = self.__scrap_number_available()
    self.product_description = self.__scrap_product_description()
    self.category = self.__scrap_category()
    self.review_rating = self.__scrap_review_rating()
    self.image_url = self.__scrap_image_url()
def __scrap_links(self):
    def get_links(soup):
        return soup.select('section a[title]')

    try:
        links = get_links(self._soup)
        page = 2
        while len(links) < self.num_books:
            base = urljoin(self.category_url, 'page-{}.html'.format(page))
            soup = FileIO.connect_with_bs4(base)
            links.extend(get_links(soup))
            page += 1
        return [(urljoin(self.category_url, x.attrs['href']), x.attrs['title'])
                for x in links]
    except Exception:
        raise Exception(
            f"Can't find the Book links ::\n{self.category_url}")
def __scrap_categories(self, to_csv=False):
    FileIO.init_root('data', False)
    categories = []
    progress_monitor.allbooks_init(self.num_books, self.site_url)
    for link in self.links:
        progress_monitor.category_update(
            len(categories), len(self.links), link[1])
        category = Category(link[0])
        categories.append(category)
        FileIO.open_category(category.name)
        category.write_csv()
        FileIO.close_category()
    return categories
def save_image(self):
    """
    Copy the remote image into the current local directory
    """
    self.image_local = self.__get_image_name()
    FileIO.download_image(self.image_url, self.image_local)
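# save_image() above relies on FileIO.download_image(url, filename). Assuming
# it simply streams the remote image to disk (a sketch, not the project's
# actual helper), it could be written with requests:
import requests

def download_image(url, filename):
    """Download `url` and write the raw bytes to `filename`."""
    response = requests.get(url, stream=True, timeout=10)
    response.raise_for_status()
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)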
def test_connect_with_bs4_ERROR():
    with pytest.raises(Exception):
        FileIO.connect_with_bs4('http://www.xxxfakexxx.xxx')
def dataset_read(source, target, batch_size, is_resize=False,
                 leave_one_num=-1, dataset='NW', sensor_num=0):
    S_train = {}
    S_val = {}
    S_test = {}
    T_train = {}
    T_val = {}
    T_test = {}

    if 'NW' == dataset:
        x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
        x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
            FileIO.load_st_AB_mat(data_path='data/AB_dataset/AB_',
                                  X_dim=4, is_resize=is_resize,
                                  leave_one_num=leave_one_num,
                                  sensor_num=sensor_num)
    elif 'UCI' == dataset:
        x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
        x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
            FileIO.load_UCI_mat(data_path='data/1_dataset_UCI_DSADS/Features/',
                                feature_length=6 * 45, X_dim=4,
                                is_resize=is_resize,
                                leave_one_num=leave_one_num,
                                sensor_num=sensor_num)

    S_train['imgs'] = x_s_train
    S_train['labels'] = y_s_train
    T_train['imgs'] = x_t_train
    T_train['labels'] = y_t_train  # input target samples for both

    S_val['imgs'] = x_s_val
    S_val['labels'] = y_s_val
    T_val['imgs'] = x_t_val
    T_val['labels'] = y_t_val

    S_test['imgs'] = x_s_test
    S_test['labels'] = y_s_test
    T_test['imgs'] = x_t_test
    T_test['labels'] = y_t_test

    train_loader = UnalignedDataLoader()
    train_loader.initialize(S_train, T_train, batch_size, batch_size)
    # train_loader.initialize(T_train, S_train, batch_size, batch_size)
    data_train = train_loader.load_data()

    test_loader = UnalignedDataLoader()
    test_loader.initialize(S_val, T_val, batch_size, batch_size)
    # test_loader.initialize(T_val, S_val, batch_size, batch_size)
    data_val = test_loader.load_data()

    final_test_loader = UnalignedDataLoader()
    final_test_loader.initialize(S_test, T_test, batch_size, batch_size)
    # final_test_loader.initialize(T_test, S_test, batch_size, batch_size)
    data_test = final_test_loader.load_data()

    print('Target test shape: {}'.format(T_test['labels'].shape))
    return data_train, data_val, data_test
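# A hypothetical call showing how dataset_read() might be used for
# leave-one-subject-out evaluation on the UCI DSADS features. The batch size
# and subject index are illustrative placeholders; `source`/`target` are not
# used inside the function body above.
data_train, data_val, data_test = dataset_read(
    source='source', target='target', batch_size=64,
    dataset='UCI', leave_one_num=0)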
# -*- coding: utf-8 -*-
"""
Created on Mon May 20 10:45:51 2019

@author: kuangen
"""
from utils import FileIO
from utils import utils
import mat4py as m4p
import numpy as np
from numpy import genfromtxt
from sklearn.model_selection import train_test_split

#%% Northwestern dataset
idx_x = np.arange(0, 368)
FileIO.save_mat('0_dataset/AB_156_to_186_walking.mat', is_walking=True)

#%%
x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
    FileIO.load_st_AB_mat(data_path='data/AB_dataset/AB_',
                          is_resize=True, leave_one_num=1)

#%% UCI DSADS dataset
# read data: [label, subjects, segments, time, sensors]
x_mat, y_mat = FileIO.read_UCI_DSADS()
FileIO.save_UCI_DSADS(x_mat, y_mat,
                      file_path='data/1_dataset_UCI_DSADS/Raw/')

#%% extract features and output data
x_mat = utils.extract_UCI_features(x_mat)
FileIO.save_UCI_DSADS(x_mat, y_mat,
                      file_path='data/1_dataset_UCI_DSADS/Features/')

#%% load UCI data
x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
def test_connect_with_bs4_TYPE():
    url = 'http://books.toscrape.com'
    assert type(FileIO.connect_with_bs4(url)) == BeautifulSoup
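# test_connect_with_bs4_TYPE() above and test_connect_with_bs4_ERROR() earlier
# constrain connect_with_bs4(url): it must return a BeautifulSoup object and
# raise on an unreachable host. A minimal sketch consistent with that
# (assumed, not the scraper's actual helper):
import requests
from bs4 import BeautifulSoup

def connect_with_bs4(url):
    """Fetch `url` and return the parsed page as a BeautifulSoup object."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')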
def load(self, filepath):
    self._Q = FileIO.read_pkl(filepath)
def predict():
    from utils import FileIO, Utils
    from word_process import WordProcess

    # Path to the data txt file on disk.
    path_base = '../data/'
    path_file = path_base + 'bytecup.corpus.train.0.50k.txt'
    fio = FileIO()
    word = WordProcess(path_base, is_model_load=False, is_dict_load=True)
    contents, titles = fio.load_from_json(path_file)

    total_size = len(titles)
    num_samples = int(total_size * 0.8)
    num_test = total_size - num_samples
    print('num samples:', num_samples, 'num tests:', num_test)

    max_encoder_seq_length = int(max([len(txt) for txt in contents])) + 2
    max_decoder_seq_length = max([len(txt) for txt in titles]) + 2
    print('max_lengths:', max_encoder_seq_length, ' ', max_decoder_seq_length)

    train_data = {
        'contents': contents[0:num_samples],
        'titles': titles[0:num_samples]
    }
    test_data = {
        'contents': contents[num_samples:total_size],
        'titles': titles[num_samples:total_size]
    }
    datasets = {
        'train': TextData2(train_data, word.dic,
                           train_len=max_encoder_seq_length,
                           label_len=max_decoder_seq_length),
        'val': TextData2(test_data, word.dic,
                         train_len=max_encoder_seq_length,
                         label_len=max_decoder_seq_length)
    }
    data_loads = {
        x: DataLoader(datasets[x], batch_size=batch_size,
                      shuffle=True, num_workers=15)
        for x in ['train', 'val']
    }

    encoder = Encoder3(voca_size=84031, embedd_size=128, hidden_size=256)
    decoder = AttnDecoder3(hidden_size=256, vocab_size=84031)
    if use_cuda:
        encoder.cuda()
        decoder.cuda()

    best_model = torch.load(path_base + './50k.1.best_model_wts')
    best_model = Utils().gpu_model_to_cpu_model(best_model)
    encoder.load_state_dict(best_model[0])
    decoder.load_state_dict(best_model[1])

    out = evaluate(encoder, decoder, datasets)
    file1 = open(path_base + '50k.1.predict', 'a')
    for i, o in enumerate(out):
        file1.write(str([word.dic[int(i)] for i in o.data[0]]))
        file1.write(str(test_data['titles'][i]) + '\n')
    file1.close()
    print('predict done!')
                i, 0:len(content)] = content_vec[0:max_encoder_seq_length]
            decoder_input_data[i, 0:len(title)] = title_vec
            decoder_target_data[i, 0:len(title) - 1] = title_vec[1:len(title)]
        yield ([encoder_input_data, decoder_input_data], decoder_target_data)


epochs = 50  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
GPUs = 2
num_encoder_tokens = 128

# Path to the data txt file on disk.
path_base = '../data1/'

# Vectorize the data.
fio = FileIO()
word = WordProcess(path_base, is_model_load=True)
wv = word.wv
contents = fio.list_read(path_base + 'bytecup.corpus.train.0.contents.txt',
                         is_flatten=True, is_return=True)
titles = fio.list_read(path_base + 'bytecup.corpus.train.0.titles.txt',
                       is_flatten=False, is_return=True)
total_size = len(titles)
num_samples = int(total_size * 0.8)
num_test = total_size - num_samples
train_data = [contents[0:num_samples], titles[0:num_samples]]
test_data = [contents[num_samples:total_size], titles[num_samples:total_size]]
    cat1.write_csv('cat1')
    cat1.write_csv('cat1')
    progress_monitor.complete()
elif args.slide == 3:
    # play with Scraper class
    print("This runs the whole website scraping")
    print("You can check the generated files in demo/slide3")
    move_to_path('demo/slide3')
    site_url = 'http://books.toscrape.com'
    site = Scraper(site_url)
    progress_monitor.complete()
elif args.slide == 4:
    # play with FileIO class
    print("This scrapes an image")
    print("You can check the generated files in demo/slide4")
    move_to_path('demo/slide4')
    image_url = 'http://books.toscrape.com/media/cache/a3/9e/a39e7c5c9fc61c2ae0f81116aa8cbb0e.jpg'
    FileIO.download_image(image_url, 'demo.jpg')
else:
    # Scrape the whole website
    site_url = 'http://books.toscrape.com'
    site = Scraper(site_url)
    progress_monitor.complete()
def test_download_image(self):
    url = "http://books.toscrape.com/media/cache/c0/59/c05972805aa7201171b8fc71a5b00292.jpg"
    name = "testdownload.jpg"
    FileIO.download_image(url, name)
    assert path.exists(name)
    remove(name)
def test_write(self):
    filepath = 'testwrite'
    FileIO.write(filepath, ['a'], {'a': 'hello'}, 'w')
    assert path.exists(f"{filepath}.csv") is True
    remove(f"{filepath}.csv")
def load(self, filepath):
    self._Q1 = FileIO.read_pkl(filepath)
    self._Q2 = np.copy(self._Q1)
def traditional_har(dataset='UCI'):
    if 'UCI' == dataset:
        sub_num = 8
        class_num = 19
        feature_length = 6
        sensor_num = 45
    elif 'NW' == dataset:
        sub_num = 10
        class_num = 7

    acc_s_LDA = np.zeros(sub_num)
    acc_t_LDA = np.zeros(sub_num)
    acc_s_SVM = np.zeros(sub_num)
    acc_t_SVM = np.zeros(sub_num)
    acc_s_ANN = np.zeros(sub_num)
    acc_t_ANN = np.zeros(sub_num)

    for i in range(sub_num):
        # load UCI dataset
        if 'UCI' == dataset:
            x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
            x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
                FileIO.load_UCI_mat(data_path='data/1_dataset_UCI_DSADS/Features/',
                                    feature_length=feature_length * 45,
                                    X_dim=2, leave_one_num=i)
            x_s_train = x_s_train[:, 0:feature_length * sensor_num]
            x_s_val = x_s_val[:, 0:feature_length * sensor_num]
            x_s_test = x_s_test[:, 0:feature_length * sensor_num]
            x_t_train = x_t_train[:, 0:feature_length * sensor_num]
            x_t_val = x_t_val[:, 0:feature_length * sensor_num]
            x_t_test = x_t_test[:, 0:feature_length * sensor_num]
        # load NW dataset
        elif 'NW' == dataset:
            x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
            x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
                FileIO.load_st_AB_mat(data_path='data/AB_dataset/AB_',
                                      X_dim=2, leave_one_num=i)
        # print(y_s_train.shape[0] + y_s_val.shape[0] + y_s_test.shape[0],
        #       y_t_train.shape[0] + y_t_val.shape[0] + y_t_test.shape[0])

        # LDA, no domain adaptation
        clf = LDA()
        clf.fit(x_s_train, y_s_train)
        y_s_test_pred = clf.predict(x_s_test)
        start = time.clock()
        # Time single-segment predictions; the timing loop uses its own index
        # so the outer subject index `i` is preserved.
        for k in range(8):
            out_prediction = clf.predict(x_s_test[[k]])
        end = time.clock()
        print('LDA: forward time for each segment:%.30f' % ((end - start) / 8.))
        acc = accuracy_score(y_s_test, y_s_test_pred)
        print("LDA: source domain accuracy: %.2f%%" % acc)
        acc_s_LDA[i] = acc
        y_t_test_pred = clf.predict(x_t_test)
        acc = accuracy_score(y_t_test, y_t_test_pred)
        print("LDA: target domain accuracy: %.2f%%" % (acc))
        acc_t_LDA[i] = acc

        # SVM, no domain adaptation
        clf = svm.LinearSVC(max_iter=5000)
        clf.fit(x_s_train, y_s_train)
        y_s_test_pred = clf.predict(x_s_test)
        start = time.clock()
        for k in range(8):
            out_prediction = clf.predict(x_s_test[[k]])
        end = time.clock()
        print('SVM: forward time for each segment:%.30f' % ((end - start) / 8.))
        acc = accuracy_score(y_s_test, y_s_test_pred)
        print("SVM: source domain accuracy: %.2f%%" % acc)
        acc_s_SVM[i] = acc
        y_t_test_pred = clf.predict(x_t_test)
        acc = accuracy_score(y_t_test, y_t_test_pred)
        print("SVM: target domain accuracy: %.2f%%" % (acc))
        acc_t_SVM[i] = acc

        #%% ANN, no domain adaptation
        # load UCI dataset
        if 'UCI' == dataset:
            x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
            x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
                FileIO.load_UCI_mat(data_path='data/1_dataset_UCI_DSADS/Features/',
                                    is_one_hot=True, is_normalized=True,
                                    feature_length=feature_length * 45,
                                    X_dim=2, leave_one_num=i)
        # load NW dataset
        if 'NW' == dataset:
            x_s_train, y_s_train, x_s_val, y_s_val, x_s_test, y_s_test, \
            x_t_train, y_t_train, x_t_val, y_t_val, x_t_test, y_t_test = \
                FileIO.load_st_AB_mat(data_path='data/AB_dataset/AB_',
                                      X_dim=2, is_one_hot=True,
                                      is_normalized=True, leave_one_num=i)
        clf = MLPClassifier(solver='sgd', activation='tanh',
                            learning_rate='adaptive', learning_rate_init=0.1,
                            hidden_layer_sizes=(10, class_num), max_iter=2000)
        clf.fit(x_s_train, y_s_train)
        y_s_test_pred = clf.predict(x_s_test)
        acc = accuracy_score(y_s_test, y_s_test_pred)
        start = time.clock()
        for k in range(8):
            out_prediction = clf.predict(x_s_test[[k]])
        end = time.clock()
        print('ANN: forward time for each segment:%.30f' % ((end - start) / 8.))
        print("ANN: source domain accuracy: %.2f%%" % acc)
        acc_s_ANN[i] = acc
        y_t_test_pred = clf.predict(x_t_test)
        acc = accuracy_score(y_t_test, y_t_test_pred)
        print("ANN: target domain accuracy: %.2f%%" % (acc))
        acc_t_ANN[i] = acc

    print('LDA: mean of test acc in the source domain:', np.mean(acc_s_LDA))
    print('LDA: mean of test acc in the target domain:', np.mean(acc_t_LDA))
    print('SVM: mean of test acc in the source domain:', np.mean(acc_s_SVM))
    print('SVM: mean of test acc in the target domain:', np.mean(acc_t_SVM))
    print('ANN: mean of test acc in the source domain:', np.mean(acc_s_ANN))
    print('ANN: mean of test acc in the target domain:', np.mean(acc_t_ANN))
    return np.transpose(np.c_[acc_s_LDA, acc_t_LDA,
                              acc_s_SVM, acc_t_SVM,
                              acc_s_ANN, acc_t_ANN])
def load(self, filepath):
    self._Q = FileIO.read_pkl(filepath)
    self._pi.update_Q(self._Q)
    self._b.update_Q(self._Q)
def main():
    from utils import FileIO
    from word_process import WordProcess

    # Path to the data txt file on disk.
    path_base = '../data/'
    path_file = path_base + 'bytecup.corpus.train.0.50k.txt'
    fio = FileIO()
    word = WordProcess(path_base, is_model_load=False, is_dict_load=True)
    dic = word.dic
    contents, titles = fio.load_from_json(path_file)

    total_size = len(titles)
    num_samples = int(total_size * 0.8)
    num_test = total_size - num_samples
    print('num samples:', num_samples, 'num tests:', num_test)

    max_encoder_seq_length = int(max([len(txt) for txt in contents])) + 2
    max_decoder_seq_length = max([len(txt) for txt in titles]) + 2
    print('max_lengths:', max_encoder_seq_length, ' ', max_decoder_seq_length)

    train_data = {
        'contents': contents[0:num_samples],
        'titles': titles[0:num_samples]
    }
    test_data = {
        'contents': contents[num_samples:total_size],
        'titles': titles[num_samples:total_size]
    }
    datasets = {
        'train': TextData2(train_data, dic,
                           train_len=max_encoder_seq_length,
                           label_len=max_decoder_seq_length),
        'val': TextData2(test_data, dic,
                         train_len=max_encoder_seq_length,
                         label_len=max_decoder_seq_length)
    }
    data_loads = {
        x: DataLoader(datasets[x], batch_size=batch_size,
                      shuffle=True, num_workers=15)
        for x in ['train', 'val']
    }

    encoder = Encoder3(voca_size=84031, embedd_size=128, hidden_size=256)
    decoder = AttnDecoder3(hidden_size=256, vocab_size=84031)
    optimizer = optim.SGD([
        {'params': encoder.parameters(), 'lr': 0.01},
        {'params': decoder.parameters(), 'lr': 0.01}
    ], lr=0.01, momentum=0.9)

    lambda1 = lambda epoch: epoch // 30
    lambda2 = lambda epoch: 0.95 ** epoch
    scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                            lr_lambda=[lambda2, lambda2])

    criterion = {'loss': nn.CosineSimilarity(dim=2), 'acc': nn.MSELoss()}
    loss_history = HistoryLoss()
    train_model(encoder, decoder, data_loads, criterion, scheduler,
                loss_history)
def save(self, filepath):
    FileIO.dump_pkl(self._Q1, filepath)
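# The Q-table save()/load() methods above assume a pair of pickle helpers,
# FileIO.dump_pkl(obj, filepath) and FileIO.read_pkl(filepath). A minimal
# sketch of that pair (an assumption, not the original utility module):
import pickle

def dump_pkl(obj, filepath):
    """Serialize `obj` to `filepath` with pickle."""
    with open(filepath, 'wb') as f:
        pickle.dump(obj, f)

def read_pkl(filepath):
    """Load and return the pickled object stored at `filepath`."""
    with open(filepath, 'rb') as f:
        return pickle.load(f)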