Example 1
import argparse
import logging
from pathlib import Path


def main():
    logger = logging.getLogger(__name__)
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--filename", default='dataset/xiaohuangji50w_nofenci.conv', type=str)
    parser.add_argument("-o", "--output_dir", default='data', type=str)
    parser.add_argument("--max_seq_len", default=10, type=int)
    parser.add_argument("--max_vocab_size", default=6500, type=int)

    args = parser.parse_args()
    logger.info("Loading raw data...")
    pairs = load_raw_data(args.filename)

    logger.info("Building word dict...")
    word_to_ix = create_word_to_ix(pairs, args.max_seq_len, args.max_vocab_size)
    logger.info(f"Vocab size: {len(word_to_ix)}")

    logger.info("Building tensor-format dataset...")
    queries, replies, lens = create_train_dataset(
        pairs, word_to_ix, args.max_seq_len)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Saving...")
    save_word_dict(word_to_ix, output_dir)
    save_dataset(queries, replies, lens, output_dir)

    logger.info("All Done!")
Example 2
import os

import numpy as np


def create_data_set_as_np_array(folder_path,
                                npy_path,
                                npy_name="data",
                                height=120,
                                width=160,
                                channels=3,
                                resize=100,
                                verbose=True):
    """
    Giving one path to a folder of folders of images,
    this function transform all images in two arrays
    one with all the flatted images 'npy_name'_<np.shape>_data.npy
    and other with all the respective labels 'npy_name'_<np.shape>_labels.npy
    both saved in 'npy_path'.

    :param folder_path: path to folder containing folders of images
                        and pickles
    :type folder_path: str
    :param npy_path: name of the data and labels array to be saved
    :type npy_path: str
    :param npy_name: path to data and labels array to be saved
    :type npy_name: str
    :param height: image height
    :type heights: int
    :param width: image width
    :type width: int
    :param channels: image channels
    :type channels: int
    :param resize: percentage to scale the image
    :type resize: int
    :param verbose: param to print path information
    :type verbose: bool
    """
    assert os.path.exists(folder_path)
    all_images = []
    all_labels = []
    for folder in os.listdir(folder_path):
        folder = os.path.join(folder_path, folder)
        if os.path.isdir(folder):
            pickle_path = folder + "_pickle"
            images, labels, shape = folder2array(folder,
                                                 pickle_path,
                                                 height,
                                                 width,
                                                 channels,
                                                 resize,
                                                 verbose)
            all_images.append(images)
            all_labels.append(labels)
    all_images = np.concatenate(all_images, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    all_labels = all_labels.reshape((all_labels.shape[0], 1))
    save_dataset(all_images, all_labels, npy_path, shape, npy_name)
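
A minimal usage sketch for the function above (all folder names and parameter values below are hypothetical, chosen only to illustrate the call; folder2array and save_dataset are assumed to be available from the same module):

if __name__ == "__main__":
    # Hypothetical layout: "raw_images/" holds one subfolder of images per
    # label; the resulting .npy files would be written to "npy_data/".
    create_data_set_as_np_array(folder_path="raw_images",
                                npy_path="npy_data",
                                npy_name="track_data",
                                resize=50,
                                verbose=True)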
Example 3
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('data_path', type=str, help='path to current data')
    parser.add_argument('labels_path', type=str, help='path to current labels')
    parser.add_argument('new_data_folder_path',
                        type=str,
                        help='path to data and labels to be saved')  # noqa
    parser.add_argument('dataset_name',
                        nargs='?',
                        default='dataset',
                        type=str,
                        help='name for dataset. (Default) dataset')  # noqa

    user_args = parser.parse_args()

    data, labels = load_dataset(user_args.data_path, user_args.labels_path)
    data, labels = extend_dataset_flip_axis(data, labels)
    data, labels = dataset_augmentation(data, labels)
    data_shape = (90, 160, 3)
    save_dataset(data, labels, user_args.new_data_folder_path, data_shape,
                 user_args.dataset_name)
Example 4
import argparse
import logging
from pathlib import Path


def main():
    logger = logging.getLogger(__name__)
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--input_dir", default='dataset', type=str)
    parser.add_argument("-o", "--output_dir", default='data', type=str)
    parser.add_argument("--max_seq_len", default=10, type=int)
    parser.add_argument("--max_vocab_size", default=8000, type=int)
    args = parser.parse_args()

    path = Path(args.input_dir)
    corpus_dir = 'cornell movie-dialogs corpus'
    lines_filename = path / corpus_dir / 'movie_lines.txt'
    conversations_filename = path / corpus_dir / 'movie_conversations.txt'

    logger.info("Loading raw data and extracting sentence pairs...")
    lines = load_lines(lines_filename)
    conversations = load_conversations(conversations_filename)
    pairs = extract_sentence_pairs(conversations, lines)

    logger.info("Building word dict...")
    word_to_ix = create_word_to_ix(pairs, args.max_seq_len,
                                   args.max_vocab_size)

    logger.info(f"Vocab size: {len(word_to_ix)}")

    logger.info("Building tensor-format dataset...")
    queries, replies, lens = create_train_dataset(pairs, word_to_ix,
                                                  args.max_seq_len)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Saving...")
    save_word_dict(word_to_ix, output_dir)
    save_dataset(queries, replies, lens, output_dir)

    logger.info("All Done!")
Example 5
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('data_path', type=str, help='path to current data')
    parser.add_argument('labels_path', type=str, help='path to current labels')
    parser.add_argument('new_data_folder_path',
                        type=str,
                        help='path to data and labels to be saved')
    parser.add_argument('dataset_name',
                        nargs='?',
                        default='dataset',
                        type=str,
                        help='name for dataset. (Default) dataset')
    parser.add_argument("-he",
                        "--image_height",
                        type=int,
                        default=120,
                        help="original height number (default=120)")
    parser.add_argument("-w",
                        "--image_width",
                        type=int,
                        default=160,
                        help="original width number (default=160)")
    parser.add_argument("-c",
                        "--image_channels",
                        type=int,
                        default=3,
                        help="number of channels (default=3)")
    parser.add_argument('-b',
                        '--binarize',
                        action='store_true',
                        help='flag to binarize the dataset (default=False)')
    parser.add_argument('-ctb',
                        '--cut_top_bottom',
                        action='store_true',
                        help='flag to cut the top and bottom of the dataset '
                             'images, resizing to their original shape '
                             '(default=False)')
    parser.add_argument('-g',
                        '--grayscale',
                        action='store_true',
                        help='flag to grayscale the dataset (default=False)')
    parser.add_argument('-gr',
                        '--green_channel',
                        action='store_true',
                        help='flag to create the dataset with only its green '
                             'channel (default=False)')
    parser.add_argument('-x',
                        '--extend_dataset',
                        action='store_true',
                        help='flag to extend the dataset by flipping its '
                             'horizontal axis in left/right labeled images '
                             '(default=False)')
    user_args = parser.parse_args()
    args_list = [
        user_args.binarize, user_args.green_channel, user_args.grayscale
    ]
    assert sum(args_list) <= 1, \
        "Multiple flags selected for image manipulation"
    data, labels = load_dataset(user_args.data_path, user_args.labels_path)
    print("After load: data shape {}, labels shape {}".format(
        data.shape, labels.shape))
    if user_args.extend_dataset:
        data, labels = extend_dataset_flip_axis(
            data,
            labels,
            height=user_args.image_height,
            width=user_args.image_width,
            channels=user_args.image_channels)
        print("After extension: data shape {}, labels shape {}".format(
            data.shape, labels.shape))
    data_shape = (user_args.image_height, user_args.image_width,
                  user_args.image_channels)
    if user_args.cut_top_bottom:
        data, data_shape = cut_top_bottom_dataset(
            data,
            height=user_args.image_height,
            width=user_args.image_width,
            channels=user_args.image_channels)
    if user_args.binarize:
        data, data_shape = binarize_dataset(data,
                                            height=user_args.image_height,
                                            width=user_args.image_width,
                                            channels=user_args.image_channels)
    if user_args.grayscale:
        data, data_shape = gray_dataset(data,
                                        height=user_args.image_height,
                                        width=user_args.image_width,
                                        channels=user_args.image_channels)
    if user_args.green_channel:
        data, data_shape = green_dataset(data,
                                         height=user_args.image_height,
                                         width=user_args.image_width,
                                         channels=user_args.image_channels)
    #data, labels = dataset_augmentation(data, labels)
    if (user_args.cut_top_bottom or user_args.binarize
            or user_args.green_channel or user_args.grayscale):
        print("After transformation: data shape {}, labels shape {}".format(
            data.shape, labels.shape))
    save_dataset(data, labels, user_args.new_data_folder_path, data_shape,
                 user_args.dataset_name)
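
As a design note, the manual assert sum(args_list) <= 1 check above could instead be delegated to argparse via a mutually exclusive group; a self-contained sketch of that alternative (not the author's code, flag names kept the same):

import argparse

# Sketch only: the mutually exclusive group makes argparse reject combinations
# such as "-b -g" at parse time, so the sum(args_list) assertion above would
# no longer be needed.
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument('-b', '--binarize', action='store_true')
group.add_argument('-g', '--grayscale', action='store_true')
group.add_argument('-gr', '--green_channel', action='store_true')
user_args = parser.parse_args()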
Example 6
        docs, labels = process_kaggle(index_path=index_path)
        return docs, labels

    if dataset == 'enron':
        index_path = 'data/enron/emails.csv'
        docs, labels = process_enron(filename=index_path)
        return docs, labels


# choose a dataset
#dataname = 'trec07'
dataname = 'enron'
model_name = 'kaggle'
docs, labels = load_clean_dataset(dataset=dataname)
print([docs[i] for i in range(min(len(docs), 10))])
util.save_dataset([docs, labels], file_identifier=dataname, prefix='docs')

if model_name:
    # use existing tokenizer for model <model_name>
    (tokenizer, length) = util.load_dataset(file_identifier=model_name,
                                            prefix='tokenizer')
    trainX, tokenizer, length = util.pre_process(docs,
                                                 tokenizer=tokenizer,
                                                 length=length)
else:
    # generate new tokenizer
    trainX, tokenizer, length = util.pre_process(docs)
    util.save_dataset([tokenizer, length],
                      file_identifier=dataname,
                      prefix='tokenizer')