def test_csv_multi_threaded_reader_output(self):
        """
        Verify that the multi-threaded CSV reader yields exactly the same
        records as a plain single-pass read of the same files.
        """
        file_paths = ['data/small_chunks/chunk1.csv',
                      'data/small_chunks/chunk2.csv',
                      'data/small_chunks/chunk3.csv']

        reader = CsvReader(chunk_size=2, worker_threads_num=3)
        expected_data = read_data_from_csv_file(file_paths)

        # Accumulate every produced chunk into a single DataChunk,
        # concatenating values field by field.
        accumulated = DataChunk()
        for chunk in reader.iter(data_path=file_paths):
            for field_name in chunk.keys():
                prev_values = accumulated[field_name] \
                    if field_name in accumulated else np.array([])
                accumulated[field_name] = np.concatenate(
                    [prev_values, chunk[field_name]])

        # Worker threads give no ordering guarantee, so compare after sorting.
        self.compare_unsorted_data_chunks(dc1=expected_data,
                                          dc2=accumulated,
                                          sort_key='id')
# Ejemplo n.º 2
# 0
    def test_creation(self):
        """Build a vocabulary over 'first_name' and check that every unique
        name from the raw CSV data is present in it.
        """
        data_path = 'mldp/tests/data/small_chunks/'
        vocab = Vocabulary(self.reader)
        vocab.create({"data_path": data_path}, "first_name")

        raw_data = read_data_from_csv_file(get_file_paths(data_path))

        for name in np.unique(raw_data['first_name']):
            self.assertTrue(name in vocab)
    def test_csv_reader_output(self):
        """Checking if read data-chunks are valid.

        Compares each chunk produced by CsvReader against manually built
        expected chunks, then verifies the reader produced exactly as many
        chunks as expected.
        """
        data_path = 'data/small_chunks/chunk2.csv'
        chunk_size = 2

        reader = CsvReader(chunk_size=chunk_size, worker_threads_num=1)

        data = read_data_from_csv_file(data_path)
        expected_chunks = create_list_of_data_chunks(data,
                                                     chunk_size=chunk_size)

        itr = reader.iter(data_path=data_path)
        i = 0
        # `zip` (builtin) replaces Python 2's `itertools.izip`, which is a
        # NameError on Python 3; it pairs chunks lazily just the same.
        for (actual_chunk, expected_chunk) in zip(itr, expected_chunks):
            self.assertTrue(actual_chunk == expected_chunk)
            i += 1

        # The reader must not yield fewer (or more) chunks than expected.
        self.assertTrue(i == len(expected_chunks))
# Ejemplo n.º 4
# 0
    def test_2D_padding(self):
        """
        Testing if padding works correctly for common scenarios of 2D data
        (batch_size x sequences).

        Specifically testing whether it produces proper padded sequences and
        their masks, and — when symbol_to_mask is provided — whether those
        symbols are correctly masked. Every combination of padding mode and
        maskable symbol is exercised.
        """
        field_names = ["text"]
        mask_field_names = ['text_mask']
        data_path = "mldp/tests/data/news.csv"
        pad_symbol = "<PAD>"
        padding_modes = ['left', 'right', 'both']
        symbols_to_mask = ["The", "a", "to", "as"]
        axis = 1

        data_chunk = read_data_from_csv_file(data_path, sep="\t")

        # Tokenize field values: each row becomes a list of tokens so the
        # chunk is 2D (batch_size x sequence).
        for fn in field_names:
            data_chunk[fn] = np.array([seq.split() for seq in data_chunk[fn]])

        for padding_mode, symbol_to_mask in product(padding_modes,
                                                    symbols_to_mask):
            padder = Padder(field_names,
                            pad_symbol=pad_symbol,
                            new_mask_fname=mask_field_names,
                            padding_mode=padding_mode,
                            axis=axis,
                            symbol_to_mask=symbol_to_mask)
            # Deep-copy so each (mode, symbol) combination pads the
            # original, un-padded data.
            padded_data_chunk = padder(copy.deepcopy(data_chunk))

            for fn, mask_fn in zip(field_names, mask_field_names):
                padded_fv = padded_data_chunk[fn]
                mask = padded_data_chunk[mask_fn]
                original_fv = data_chunk[fn]

                # Padded output must be a proper 2D array.
                self.assertTrue(len(padded_fv.shape) == 2)
                self._test_padded_values(original_field_values=original_fv,
                                         padded_field_values=padded_fv,
                                         mask=mask,
                                         pad_symbol=pad_symbol,
                                         symbol_to_mask=symbol_to_mask)
# Ejemplo n.º 5
# 0
    def test_vocabulary_mapper(self):
        """Check that mapping field values to ids and back is lossless.

        For each target field, tokens are converted to vocabulary ids and
        then converted back; the round-trip must reproduce the original data.
        """
        data_path = 'mldp/tests/data/mock_data.csv'
        target_fields = ["first_name", "last_name", "email", "gender"]

        reader = CsvReader(sep=',')
        vocab = Vocabulary(reader)

        for field in target_fields:
            vocab.create(data_source={"data_path": data_path},
                         data_fnames=field)

            original = read_data_from_csv_file(data_path)
            reference = copy.deepcopy(original)

            to_ids = VocabMapper({field: vocab}, "id")
            to_tokens = VocabMapper({field: vocab}, "token")

            round_tripped = to_tokens(to_ids(original))

            self.assertTrue(
                (round_tripped[field] == reference[field]).all())
# Ejemplo n.º 6
# 0
def read_csv_data(path):
    """Read a comma-separated CSV file and coerce every non-'id' column's
    values to strings.

    :param path: path of the CSV file to read.
    :return: the dataset with all non-'id' columns cast via ``astype('str')``.
    """
    dataset = read_data_from_csv_file(path, sep=',')
    for field_name, values in dataset.items():
        if field_name == 'id':
            continue
        dataset[field_name] = values.astype('str')
    return dataset