def main():
    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--filename",
                        default='dataset/xiaohuangji50w_nofenci.conv', type=str)
    parser.add_argument("-o", "--output_dir", default='data', type=str)
    parser.add_argument("--max_seq_len", default=10, type=int)
    parser.add_argument("--max_vocab_size", default=6500, type=int)
    args = parser.parse_args()

    logger.info("Loading raw data...")
    pairs = load_raw_data(args.filename)

    logger.info("Building word dict...")
    word_to_ix = create_word_to_ix(pairs, args.max_seq_len, args.max_vocab_size)
    logger.info(f"Vocab size: {len(word_to_ix)}")

    logger.info("Building tensor-format dataset...")
    queries, replies, lens = create_train_dataset(
        pairs, word_to_ix, args.max_seq_len)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Saving...")
    save_word_dict(word_to_ix, output_dir)
    save_dataset(queries, replies, lens, output_dir)
    logger.info("All Done!")
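# Hedged sketch: the snippet above defines main() but shows no entry point. The guard
# below is an assumption about how the script is run; logging.basicConfig is included
# only so the logger.info messages in main() become visible on the console.
if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    main()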
def create_data_set_as_np_array(folder_path,
                                npy_path,
                                npy_name="data",
                                height=120,
                                width=160,
                                channels=3,
                                resize=100,
                                verbose=True):
    """
    Given a path to a folder of folders of images, this function transforms
    all images into two arrays: one with all the flattened images,
    '<npy_name>_<np.shape>_data.npy', and one with the respective labels,
    '<npy_name>_<np.shape>_labels.npy', both saved in 'npy_path'.

    :param folder_path: path to a folder containing folders of images and pickles
    :type folder_path: str
    :param npy_path: path where the data and labels arrays are saved
    :type npy_path: str
    :param npy_name: base name of the data and labels arrays to be saved
    :type npy_name: str
    :param height: image height
    :type height: int
    :param width: image width
    :type width: int
    :param channels: image channels
    :type channels: int
    :param resize: percentage to scale the image
    :type resize: int
    :param verbose: whether to print path information
    :type verbose: bool
    """
    assert os.path.exists(folder_path)
    all_images = []
    all_labels = []
    for folder in os.listdir(folder_path):
        folder = os.path.join(folder_path, folder)
        if os.path.isdir(folder):
            pickle_path = folder + "_pickle"
            images, labels, shape = folder2array(folder,
                                                 pickle_path,
                                                 height,
                                                 width,
                                                 channels,
                                                 resize,
                                                 verbose)
            all_images.append(images)
            all_labels.append(labels)
    all_images = np.concatenate(all_images, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    all_labels = all_labels.reshape((all_labels.shape[0], 1))
    save_dataset(all_images, all_labels, npy_path, shape, npy_name)
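# Hypothetical usage sketch for create_data_set_as_np_array. The input folder
# 'raw_frames' and the output directory 'npy_data' are illustrative assumptions,
# not paths taken from the original project.
if __name__ == '__main__':
    create_data_set_as_np_array(folder_path='raw_frames',
                                npy_path='npy_data',
                                npy_name='data',
                                height=120,
                                width=160,
                                channels=3,
                                resize=100,
                                verbose=True)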
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('data_path',
                        type=str,
                        help='path to current data')
    parser.add_argument('labels_path',
                        type=str,
                        help='path to current labels')
    parser.add_argument('new_data_folder_path',
                        type=str,
                        help='path to data and labels to be saved')
    parser.add_argument('dataset_name',
                        nargs='?',
                        default='dataset',
                        type=str,
                        help='name for dataset. (Default) dataset')
    user_args = parser.parse_args()

    data, labels = load_dataset(user_args.data_path, user_args.labels_path)
    data, labels = extend_dataset_flip_axis(data, labels)
    data, labels = dataset_augmentation(data, labels)
    data_shape = (90, 160, 3)
    save_dataset(data,
                 labels,
                 user_args.new_data_folder_path,
                 data_shape,
                 user_args.dataset_name)
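# Assumed entry point; the script name and file names in the example invocation are
# illustrative only (dataset_name is optional and defaults to 'dataset'):
#   python augment_dataset.py data.npy labels.npy augmented_data
if __name__ == '__main__':
    main()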
def main():
    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--input_dir", default='dataset', type=str)
    parser.add_argument("-o", "--output_dir", default='data', type=str)
    parser.add_argument("--max_seq_len", default=10, type=int)
    parser.add_argument("--max_vocab_size", default=8000, type=int)
    args = parser.parse_args()

    path = Path(args.input_dir)
    corpus_dir = 'cornell movie-dialogs corpus'
    lines_filename = path / corpus_dir / 'movie_lines.txt'
    conversations_filename = path / corpus_dir / 'movie_conversations.txt'

    logger.info("Loading raw data and extracting sentence pairs...")
    lines = load_lines(lines_filename)
    conversations = load_conversations(conversations_filename)
    pairs = extract_sentence_pairs(conversations, lines)

    logger.info("Building word dict...")
    word_to_ix = create_word_to_ix(pairs, args.max_seq_len, args.max_vocab_size)
    logger.info(f"Vocab size: {len(word_to_ix)}")

    logger.info("Building tensor-format dataset...")
    queries, replies, lens = create_train_dataset(pairs, word_to_ix,
                                                  args.max_seq_len)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Saving...")
    save_word_dict(word_to_ix, output_dir)
    save_dataset(queries, replies, lens, output_dir)
    logger.info("All Done!")
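# Directory layout implied by the paths built in main() above: the corpus files are
# expected inside --input_dir (default 'dataset'):
#
#   dataset/
#       cornell movie-dialogs corpus/
#           movie_lines.txt
#           movie_conversations.txt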
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('data_path',
                        type=str,
                        help='path to current data')
    parser.add_argument('labels_path',
                        type=str,
                        help='path to current labels')
    parser.add_argument('new_data_folder_path',
                        type=str,
                        help='path to data and labels to be saved')
    parser.add_argument('dataset_name',
                        nargs='?',
                        default='dataset',
                        type=str,
                        help='name for dataset. (Default) dataset')
    parser.add_argument("-he", "--image_height",
                        type=int,
                        default=120,
                        help="original height number (default=120)")
    parser.add_argument("-w", "--image_width",
                        type=int,
                        default=160,
                        help="original width number (default=160)")
    parser.add_argument("-c", "--image_channels",
                        type=int,
                        default=3,
                        help="number of channels (default=3)")
    parser.add_argument('-b', '--binarize',
                        action='store_true',
                        help='flag to binarize the dataset (default=False)')
    parser.add_argument('-ctb', '--cut_top_bottom',
                        action='store_true',
                        help='flag to cut the top and bottom of the dataset images, '
                             'resizing to their original shape (default=False)')
    parser.add_argument('-g', '--grayscale',
                        action='store_true',
                        help='flag to grayscale the dataset (default=False)')
    parser.add_argument('-gr', '--green_channel',
                        action='store_true',
                        help='flag to create the dataset with only its green channel '
                             '(default=False)')
    parser.add_argument('-x', '--extend_dataset',
                        action='store_true',
                        help='flag to extend the dataset by flipping its horizontal '
                             'axis in left/right labeled images (default=False)')
    user_args = parser.parse_args()

    # Only one of the mutually exclusive image transformations may be selected.
    args_list = [user_args.binarize,
                 user_args.green_channel,
                 user_args.grayscale]
    assert sum(args_list) <= 1, "Multiple flags selected for image manipulation"

    data, labels = load_dataset(user_args.data_path, user_args.labels_path)
    print("After load: data shape {}, labels shape {}".format(
        data.shape, labels.shape))

    if user_args.extend_dataset:
        data, labels = extend_dataset_flip_axis(
            data,
            labels,
            height=user_args.image_height,
            width=user_args.image_width,
            channels=user_args.image_channels)
        print("After extension: data shape {}, labels shape {}".format(
            data.shape, labels.shape))

    data_shape = (user_args.image_height,
                  user_args.image_width,
                  user_args.image_channels)

    if user_args.cut_top_bottom:
        data, data_shape = cut_top_bottom_dataset(
            data,
            height=user_args.image_height,
            width=user_args.image_width,
            channels=user_args.image_channels)

    if user_args.binarize:
        data, data_shape = binarize_dataset(
            data,
            height=user_args.image_height,
            width=user_args.image_width,
            channels=user_args.image_channels)

    if user_args.grayscale:
        data, data_shape = gray_dataset(
            data,
            height=user_args.image_height,
            width=user_args.image_width,
            channels=user_args.image_channels)

    if user_args.green_channel:
        data, data_shape = green_dataset(
            data,
            height=user_args.image_height,
            width=user_args.image_width,
            channels=user_args.image_channels)

    # data, labels = dataset_augmentation(data, labels)

    if (user_args.cut_top_bottom or user_args.binarize
            or user_args.green_channel or user_args.grayscale):
        print("After transformation: data shape {}, labels shape {}".format(
            data.shape, labels.shape))

    save_dataset(data,
                 labels,
                 user_args.new_data_folder_path,
                 data_shape,
                 user_args.dataset_name)
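# Assumed entry point. The example invocation extends the dataset by flipping
# left/right labeled frames (-x) and converts the result to grayscale (-g); the
# script and file names are illustrative assumptions:
#   python transform_dataset.py data.npy labels.npy new_data my_dataset -x -g
if __name__ == '__main__':
    main()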
        docs, labels = process_kaggle(index_path=index_path)
        return docs, labels
    if dataset == 'enron':
        index_path = 'data/enron/emails.csv'
        docs, labels = process_enron(filename=index_path)
        return docs, labels


# choose a dataset
# dataname = 'trec07'
dataname = 'enron'
model_name = 'kaggle'

docs, labels = load_clean_dataset(dataset=dataname)
print([docs[i] for i in range(min(len(docs), 10))])
util.save_dataset([docs, labels], file_identifier=dataname, prefix='docs')

if model_name:
    # use existing tokenizer for model <model_name>
    (tokenizer, length) = util.load_dataset(file_identifier=model_name,
                                            prefix='tokenizer')
    trainX, tokenizer, length = util.pre_process(docs,
                                                 tokenizer=tokenizer,
                                                 length=length)
else:
    # generate new tokenizer
    trainX, tokenizer, length = util.pre_process(docs)
    util.save_dataset([tokenizer, length], file_identifier=dataname,
                      prefix='tokenizer')
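# Later-reuse sketch (assumption, not part of the original script): reload a
# previously stored tokenizer with the same util helpers so unseen documents are
# encoded with an identical vocabulary and padded length. 'new_docs' is a
# hypothetical list of raw text strings.
new_docs = ["example unseen email text"]
saved_tokenizer, saved_length = util.load_dataset(file_identifier=model_name,
                                                  prefix='tokenizer')
newX, _, _ = util.pre_process(new_docs,
                              tokenizer=saved_tokenizer,
                              length=saved_length)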