def _get_test_input_function(self):
        """
        Inheriting class must implement this
        :return: dataset
        """
        dataset = tf.data.Dataset.from_generator(
            self._yield_test_samples, (tf.float32, tf.bool, tf.bool),
            output_shapes=(TensorShape([
                Dimension(self._hparams.frames_per_sample),
                Dimension(self._hparams.neff)
            ]),
                           TensorShape([
                               Dimension(self._hparams.frames_per_sample),
                               Dimension(self._hparams.neff)
                           ]),
                           TensorShape([
                               Dimension(self._hparams.frames_per_sample),
                               Dimension(self._hparams.neff),
                               Dimension(2)
                           ])))

        dataset = dataset.map(
            self.feature_map_func,
            num_parallel_calls=self._hparams.num_parallel_calls)

        dataset = dataset.batch(batch_size=self._hparams.batch_size,
                                drop_remainder=True)
        dataset = dataset.prefetch(self._hparams.prefetch_size)
        dataset = dataset.cache(
            filename=os.path.join(self.iterator_dir, "test_data_cache"))
        print_info("Dataset output sizes are: ")
        print_info(dataset.output_shapes)
        return dataset
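
A minimal sketch of a generator that would satisfy the (tf.float32, tf.bool, tf.bool) output signature declared above; the frames_per_sample/neff values and the function name are illustrative, not taken from the source.

import numpy as np

def yield_test_samples(frames_per_sample=100, neff=129, n_samples=4):
    """Yield (mixed-speech features, VAD mask, ideal-binary-mask) triples."""
    for _ in range(n_samples):
        mix = np.random.randn(frames_per_sample, neff).astype("float32")
        vad = np.zeros((frames_per_sample, neff), dtype=bool)
        mask = np.zeros((frames_per_sample, neff, 2), dtype=bool)
        yield mix, vad, mask
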
    def before_run(self, run_context):

        if self._path is None:
            self._path = os.path.join(os.path.expanduser("~"),
                                      "vitaFlow/runtime/GAN")

        global_step = run_context.session.run(self._global_Step)

        print_info("global_step {}".format(global_step))

        if global_step % self._store_interval_steps == 0:  # store every n steps
            samples = run_context.session.run(self._z_image)
            channel = self._z_image.get_shape()[-1]

            if channel == 1:
                images_grid = images_square_grid(samples, "L")
            else:
                images_grid = images_square_grid(samples, "RGB")

            if not os.path.exists(self._path):
                os.makedirs(self._path)

            images_grid.save(
                os.path.join(self._path, 'step_{}.png'.format(global_step)))

        if global_step % self._log_interval_steps == 0:
            dloss, gloss = run_context.session.run(
                [self._d_loss, self._g_loss])
            print_info(
                "\nDiscriminator Loss: {:.4f}... Generator Loss: {:.4f}".
                format(dloss, gloss))
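
A self-contained sketch of the "save a grid of generated samples" step performed above; images_square_grid is a project helper, so a naive numpy/PIL tiling is used here instead, with illustrative sizes.

import numpy as np
from PIL import Image

samples = (np.random.rand(16, 28, 28) * 255).astype("uint8")  # 16 fake grayscale samples
side = 4  # 4x4 grid
grid = np.vstack([np.hstack(samples[row * side:(row + 1) * side]) for row in range(side)])
Image.fromarray(grid, mode="L").save("step_0.png")
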
    def _get_val_input_fn(self):
        """
        Inheriting class must implement this
        :return: dataset
        """

        dataset = tf.data.Dataset.from_tensor_slices(
            (list(self.VAL_WAV_PAIR.keys()), list(self.VAL_WAV_PAIR.values())))
        dataset = dataset.map(
            lambda wav_file_1, wav_file_2: tuple(
                tf.py_func(self.generate_features, [wav_file_1, wav_file_2],
                           (tf.float32, tf.bool, tf.bool))),
            num_parallel_calls=self._hparams.num_parallel_calls)
        dataset = dataset.map(
            self._user_resize_func,
            num_parallel_calls=self._hparams.num_parallel_calls)

        dataset = dataset.batch(batch_size=self._hparams.batch_size,
                                drop_remainder=True)
        dataset = dataset.prefetch(self._hparams.prefetch_size)
        dataset = dataset.cache(
            filename=os.path.join(self.iterator_dir, "val_data_cache"))
        print_info("Dataset output sizes are: ")
        print_info(dataset.output_shapes)
        return dataset
    def _get_test_input_fn(self):
        file_name = "test_padded_data_" + str(self._use_char_embd) + ".p"
        train_sentences, train_char_ids, train_ner_tags = None, None, None
        data = self.get_padded_data(file_name=file_name)

        if data is None:
            train_sentences, train_char_ids, train_ner_tags = \
                self._make_seq_pair(df_files_path=self.TEST_FILES_IN_PATH,
                                    char_2_id_map=self.CHAR_2_ID_MAP,
                                    use_char_embd=self._use_char_embd)
            self.store_padded_data(data=(train_sentences, train_char_ids, train_ner_tags), file_name=file_name)
        else:
            train_sentences, train_char_ids, train_ner_tags = data

        # print_error(train_char_ids)
        # print_info(train_ner_tags)
        if self._use_char_embd:
            dataset = tf.data.Dataset.from_tensor_slices(({self.FEATURE_1_NAME: train_sentences,
                                                           self.FEATURE_2_NAME: train_char_ids},
                                                          train_ner_tags))
        else:
            dataset = tf.data.Dataset.from_tensor_slices(({self.FEATURE_1_NAME: train_sentences},
                                                          train_ner_tags))
        dataset = dataset.batch(batch_size=self._batch_size)
        print_info("Dataset output sizes are: ")
        print_info(dataset.output_shapes)
        return dataset
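
A self-contained illustration of the ({feature_dict}, labels) dataset structure built above; the key names, shapes and vocab sizes are illustrative only.

import numpy as np
import tensorflow as tf

word_ids = np.random.randint(0, 100, size=(6, 10))     # 6 sentences, 10 tokens each
char_ids = np.random.randint(0, 50, size=(6, 10, 8))   # 8 chars per token
tag_ids = np.random.randint(0, 5, size=(6, 10))

dataset = tf.data.Dataset.from_tensor_slices(
    ({"word_ids": word_ids, "char_ids": char_ids}, tag_ids))
dataset = dataset.batch(batch_size=2)
print(dataset.output_shapes)  # ({'word_ids': (?, 10), 'char_ids': (?, 10, 8)}, (?, 10))
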
Example #5
    def _get_speaker_files(self, data_dir):  #TODO S3 support
        """

        :param data_dir: dir containing the training data (root_dir + speaker_dir + wavfiles)
        :returns:  speaker_wav_files (dict) : {speaker : [files]}
        """
        # get dirs for each speaker
        speakers_dirs = [os.path.join(data_dir, speaker) for speaker in os.listdir(data_dir) \
                         if os.path.isdir(os.path.join(data_dir, speaker))]
        print_info(speakers_dirs)

        speaker_wav_files_dict = {}

        # get the files in each speakers dir
        # TODO: Convert the loop below to use collections.defaultdict (see the sketch after this function)
        for speaker_dir in speakers_dirs:
            speaker = speaker_dir.split("/")[-1]
            wav_files = [
                os.path.join(speaker_dir, file)
                for file in os.listdir(speaker_dir) if file.endswith("wav")
            ]
            for wav_file in wav_files:
                if speaker not in speaker_wav_files_dict:
                    speaker_wav_files_dict[speaker] = []
                speaker_wav_files_dict[speaker].append(wav_file)

        if len(speaker_wav_files_dict) == 0:
            raise RuntimeError(
                "shabda: No files found under directory: {}".format(data_dir))

        return speaker_wav_files_dict
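
One possible defaultdict-based rewrite of the collection loop above, as suggested by the TODO; a sketch only, not the project's implementation.

import os
from collections import defaultdict

def get_speaker_wav_files(data_dir):
    speaker_wav_files = defaultdict(list)
    for speaker in os.listdir(data_dir):
        speaker_dir = os.path.join(data_dir, speaker)
        if not os.path.isdir(speaker_dir):
            continue
        for file_name in os.listdir(speaker_dir):
            if file_name.endswith("wav"):
                speaker_wav_files[speaker].append(os.path.join(speaker_dir, file_name))
    return dict(speaker_wav_files)
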
    def __init__(self,
                 experiment_name,
                 name="NaiveConvNet",
                 model_root_directory=os.path.expanduser("~") + "/vitaFlow/",
                 out_dim=-1,
                 learning_rate=0.001,
                 keep_probability=0.5,
                 data_iterator=None):
        ClassifierBase.__init__(self,
                                experiment_name=experiment_name,
                                model_root_directory=model_root_directory,
                                name=name,
                                out_dim=out_dim,
                                learning_rate=learning_rate)
        ImageFeature.__init__(self)
        # self._hparams = HParams(hparams, self.default_hparams())

        self._data_iterator = data_iterator
        self._keep_prob = keep_probability

        self._conv_num_outputs = 32  # TODO
        self._conv_ksize = (5, 5)
        self._conv_strides = (1, 1)
        self._pool_ksize = (2, 2)
        self._pool_strides = (2, 2)

        self._num_outputs = 10  #number of classes # TODO

        print_info("NaiveConvNet initialized")
Example #7
    def _get_test_input_function(self):
        """
        Inheriting class must implement this
        :return: dataset
        """
        dataset = tf.data.Dataset.from_generator(
            self._yield_test_samples, (tf.float32, tf.bool, tf.bool),
            output_shapes=(TensorShape([
                Dimension(self._hparams.frames_per_sample),
                Dimension(self._hparams.neff)
            ]),
                           TensorShape([
                               Dimension(self._hparams.frames_per_sample),
                               Dimension(self._hparams.neff)
                           ]),
                           TensorShape([
                               Dimension(self._hparams.frames_per_sample),
                               Dimension(self._hparams.neff),
                               Dimension(2)
                           ])))
        # Map the generator output as features as a dict and labels
        dataset = dataset.map(lambda x, y, z: ({
            self.FEATURE_1_NAME: x,
            self.FEATURE_2_NAME: y
        }, z))

        dataset = dataset.batch(batch_size=self._hparams.batch_size,
                                drop_remainder=True)
        dataset = dataset.prefetch(self._hparams.prefetch_size)
        # dataset = dataset.cache(filename=os.path.join(self.iterator_dir, "test_data_cache"))
        print_info("Dataset output sizes are: ")
        print_info(dataset.output_shapes)
        return dataset
Example #8
    def generator(self, z, out_channel_dim, is_train=True):
        """
        Create the _generator network
        :param z: Input z on dimension Z
        :param out_channel_dim: The number of channels in the output image
        :param is_train: Boolean if _generator is being used for training
        :return: The tensor output of the _generator
        """

        with tf.variable_scope('_generator', reuse=False):
            gen_filter_size = self.gen_filter_size

            # x = tf.layers.batch_normalization(z)
            # First fully connected layer
            x = tf.layers.dense(z, 8 * 8 * gen_filter_size)
            # Reshape it to start the convolutional stack
            x = tf.reshape(x, (-1, 8, 8, gen_filter_size))
            # x = tf.layers.batch_normalization(x, training=is_train)
            x = tf.maximum(self.alpha * x, x)

            x = tf.layers.conv2d_transpose(x,
                                           gen_filter_size // 2,
                                           5,
                                           strides=1,
                                           padding='same')
            x = tf.maximum(self.alpha * x, x)

            x = tf.layers.batch_normalization(x, training=is_train)

            gen_filter_size = gen_filter_size // 4
            # 32 //  8 = sqrt(4)  => 2 => (8) -> 16 -> 32
            # 64 //  8 = sqrt(8)  => 3 => (8) -> 16 -> 32 -> 64
            # 128 // 8 = sqrt(16) => 4 => (8) -> 16 -> 32 -> 64 -> 128

            # Based on the image size, add transposed-conv layers with appropriate filter sizes
            for i in range(int(math.sqrt(self.image_size // 8))):
                gen_filter_size = gen_filter_size // 2
                x = tf.layers.conv2d_transpose(x,
                                               gen_filter_size,
                                               5,
                                               strides=2,
                                               padding='same')
                x = tf.maximum(self.alpha * x, x)
                x = tf.layers.batch_normalization(x, training=is_train)

                print_info("======>x at conv layer {} is {}".format(i, x))

            # Output layer
            logits = tf.layers.conv2d_transpose(x,
                                                out_channel_dim,
                                                5,
                                                strides=1,
                                                padding='same')
            # HxWxNUM_CHANNELS now
            out = tf.tanh(logits)

            print_info("======>out: {}".format(out))

            return out
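
The tf.maximum(self.alpha * x, x) pattern used throughout the generator above is a leaky ReLU (for 0 < alpha < 1); a quick numpy check of that equivalence, with an illustrative alpha:

import numpy as np

alpha = 0.2
x = np.array([-2.0, -0.5, 0.0, 1.0, 3.0])
leaky = np.maximum(alpha * x, x)
assert np.allclose(leaky, np.where(x > 0, x, alpha * x))
print(leaky)  # [-0.4 -0.1  0.   1.   3. ]
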
Example #9
    def generator(self, z, out_channel_dim, is_training=True, reuse=False):
        """
        Create the namespace_generator network
        :param z: Input z
        :param out_channel_dim: The number of channels in the output image
        :param is_training: Boolean if namespace_generator is being used for training
        :return: The tensor output of the namespace_generator
        """

        with tf.variable_scope(
                'namespace_generator',
                reuse=not is_training):  # reuse if it is not the training phase
            filter_size = 512

            # First fully connected layer
            x = tf.layers.dense(z, 8 * 8 * filter_size)
            # Reshape it to start the convolutional stack
            x = tf.reshape(x, (-1, 8, 8, filter_size))
            x = tf.maximum(self.alpha * x, x)

            x = tf.layers.conv2d_transpose(x,
                                           filter_size // 2,
                                           5,
                                           strides=1,
                                           padding='same')
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.maximum(self.alpha * x, x)

            filter_size = filter_size // 4
            # 32 //  8 = sqrt(4)  => 2 => (8) -> 16 -> 32
            # 64 //  8 = sqrt(8)  => 3 => (8) -> 16 -> 32 -> 64
            # 128 // 8 = sqrt(16) => 4 => (8) -> 16 -> 32 -> 64 -> 128

            for i in range(int(math.sqrt(self.image_size // 8))):
                filter_size = filter_size // 2
                x = tf.layers.conv2d_transpose(x,
                                               filter_size,
                                               5,
                                               strides=2,
                                               padding='same')
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.maximum(self.alpha * x, x)

                print_info("======>out: {}".format(x))

            # Output layer
            logits = tf.layers.conv2d_transpose(x,
                                                out_channel_dim,
                                                5,
                                                strides=1,
                                                padding='same')
            # 28x28x3 now
            #         print(logits)
            out = tf.tanh(logits)

            print_info("======>out: {}".format(out))

            return out
 def read_pickle(self, file_name):
     file_path = os.path.join(self.dataset_dir, file_name)
     if os.path.exists(file_path):
         print_info("Reading the pickle file {}...".format(file_path))
         with open(file_path, 'rb') as f:
             data = pickle.load(f)
         return data
     else:
         return None
 def get_padded_data(self, file_name):
     file_path = os.path.join(self.EXPERIMENT_ROOT_DIR, file_name)
     if os.path.exists(file_path):
         print_info("Reading the padded data...")
         with open(file_path, 'rb') as f:
             data = pickle.load(f)
         return data
     else:
         return None
Example #12
 def get_dataset():
     dataset = tf.data.Dataset.from_tensor_slices((
         {self.FEATURE_1_NAME: in_data_features,
          self.FEATURE_2_NAME: voice_activity_detection_data_features},
         np.ones_like(in_data_features)
     ))
     dataset = dataset.batch(batch_size=1)
     print_info(dataset.output_shapes)
     return dataset
Example #13
    def _discriminator(self, images, reuse=False):
        """
        Create the _discriminator network
        :param images: Tensor of input image(s)
        :param reuse: Boolean if the weights should be reused
        :return: Tuple of (tensor output of the _discriminator, tensor logits of the _discriminator)
        """

        with tf.variable_scope('_discriminator', reuse=reuse):
            # Input layer consider ?x32x32x3
            x1 = tf.layers.conv2d(
                images,
                64,
                5,
                strides=2,
                padding='same',
                kernel_initializer=tf.random_normal_initializer(stddev=0.02))
            relu1 = tf.maximum(0.02 * x1, x1)
            relu1 = tf.layers.dropout(relu1, rate=0.5)
            # 16x16x64
            x2 = tf.layers.conv2d(
                relu1,
                128,
                5,
                strides=2,
                padding='same',
                kernel_initializer=tf.random_normal_initializer(stddev=0.02))
            bn2 = tf.layers.batch_normalization(x2, training=True)
            relu2 = tf.maximum(0.02 * bn2, bn2)
            relu2 = tf.layers.dropout(relu2, rate=0.5)
            # 8x8x128
            x3 = tf.layers.conv2d(
                relu2,
                256,
                5,
                strides=2,
                padding='same',
                kernel_initializer=tf.random_normal_initializer(stddev=0.02))
            bn3 = tf.layers.batch_normalization(x3, training=True)
            relu3 = tf.maximum(0.02 * bn3, bn3)
            relu3 = tf.layers.dropout(relu3, rate=0.5)
            # 4x4x256
            # Flatten it
            flat = tf.reshape(relu3, (-1, 4 * 4 * 256))
            logits = tf.layers.dense(flat, 1)
            #         print(logits)
            out = tf.sigmoid(logits)
            #         print('_discriminator out: ', out)

            print_info("======> _discriminator out: {}".format(out))

            return out, logits
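
A quick check of the spatial sizes noted in the comments above: with padding='same' and stride 2, each conv halves the spatial dimension (ceil(H / 2)), so a 32x32 input flattens to 4*4*256 after three conv layers.

import math

h = 32
for expected in (16, 8, 4):
    h = math.ceil(h / 2)
    assert h == expected
print("flattened size:", 4 * 4 * 256)  # 4096, matching the tf.reshape above
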
Example #14
def image_annotations(path_to_tensorflow_model, category_index, images_src, images_dest):
    def get_box_dims(box, image_shape):
        # TF object-detection boxes are normalized [ymin, xmin, ymax, xmax]; scale them
        # back to pixel coordinates (a numpy image shape is (height, width, depth)).
        ymin, xmin, ymax, xmax = box
        im_height, im_width, im_depth = image_shape
        ymin, xmin, ymax, xmax = map(int, (ymin * im_height, xmin * im_width,
                                           ymax * im_height, xmax * im_width))
        return (ymax, xmax, ymin, xmin)

    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(path_to_tensorflow_model, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')
        sess = tf.Session(graph=detection_graph)

    image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
    detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
    detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
    detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
    num_detections = detection_graph.get_tensor_by_name('num_detections:0')

    bag = []

    for image_path in tqdm(glob(images_src+"/*")):
        print_info("Processing {}".format(image_path))
        image = plt.imread(image_path)
        image_expanded = np.expand_dims(image, axis=0)

        # Perform the actual detection by running the model with the image as input
        (boxes, scores, classes, num) = sess.run(
            [detection_boxes, detection_scores, detection_classes, num_detections],
            feed_dict={image_tensor: image_expanded})

        mask = scores > 0.3

        image_shape = image.shape

        coords = list(map(lambda x: get_box_dims(x, image_shape), boxes[mask].tolist()))
        tags = list(map(lambda x: category_index[int(x)]['name'], classes[mask].tolist()))
        scores = scores[mask].tolist()
        bag.append({'image_loc': image_path,
                    'dest': images_dest,
                    'coords': coords,
                    'tags': tags,
                    'scores': scores})

    # pprint(bag)
    return bag
Example #15
    def parallel_convert(self):
        print_info("Running OCR : {}".format(self._image_dir))
        with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
            image_list = glob.glob(self._image_dir + os.sep + "*/*.jpg")
            image_list.extend(glob.glob(self._image_dir + os.sep + "*/*.jpeg"))
            image_list.extend(glob.glob(self._image_dir + os.sep + "*/*.png"))

            # print_info(image_list)
            try:
                for img_path, out_file in zip(
                        image_list, executor.map(self.convert, image_list)):
                    print(img_path, ',', out_file, ', processed')
            except Exception as e:
                print("OCR conversion failed:", e)
Example #16
    def _get_test_input_function(self):
        """
        Inheriting class must implement this
        :return: dataset
        """
        dataset = tf.data.TFRecordDataset(
            glob.glob(
                os.path.join(self._dataset.TEST_OUT_PATH,
                             "tfrecords/*.tfrecord")),
            num_parallel_reads=self._hparams.num_threads)
        # Map the generator output as features as a dict and labels
        dataset = dataset.map(self.decode)

        dataset = dataset.batch(batch_size=self._hparams.batch_size,
                                drop_remainder=True)
        dataset = dataset.prefetch(self._hparams.prefetch_size)
        print_info("Dataset output sizes are: ")
        print_info(dataset.output_shapes)
        return dataset
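
The TFRecord pipelines above rely on a self.decode mapper that is not shown here; a sketch of what such a parser might look like, assuming one tf.train.Example per sample (feature names and shapes are assumptions, not taken from the source):

import tensorflow as tf

def decode(serialized_example, frames_per_sample=100, neff=129):
    parsed = tf.parse_single_example(
        serialized_example,
        features={
            "speech_mix": tf.FixedLenFeature([frames_per_sample * neff], tf.float32),
            "vad": tf.FixedLenFeature([frames_per_sample * neff], tf.int64),
            "mask": tf.FixedLenFeature([frames_per_sample * neff * 2], tf.int64),
        })
    mix = tf.reshape(parsed["speech_mix"], [frames_per_sample, neff])
    vad = tf.cast(tf.reshape(parsed["vad"], [frames_per_sample, neff]), tf.bool)
    mask = tf.cast(tf.reshape(parsed["mask"], [frames_per_sample, neff, 2]), tf.bool)
    return {"speech_mix": mix, "vad": vad}, mask
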
    def _build(self, features, labels, params, mode, config=None):

        images = features[self.FEATURE_NAME]

        shape = images.get_shape()
        assert (len(shape) == 4)
        batch = shape[0].value
        rows = shape[1].value
        cols = shape[2].value
        channel = shape[3].value

        print_info("{} {} {}".format(batch, rows, cols))

        # Loss, training and eval operations are not needed during inference.
        loss = None
        optimizer = None
        eval_metric_ops = {}

        logits = self._build_layers(features=images, mode=mode)
        predicted_class = self._get_predicted_classes(logits=logits)
        predicted_probabilities = self._get_class_probabilities(logits=logits)
        # top_k = self._get_top_k_predictions(logits=logits)
        predictions = {
            "classes": predicted_class,
            "probabilities": predicted_probabilities,
            "logits": logits
        }

        if mode != tf.estimator.ModeKeys.PREDICT:
            # labels = tf.reshape(labels, shape=(-1, self._out_dim), name="labels")
            tf.logging.info('labels: -----> {}'.format(labels))

            loss = self._get_loss(labels=labels, logits=logits)
            optimizer = self._get_optimizer(loss)
            eval_metric_ops = self._get_eval_metrics(logits=logits,
                                                     labels=labels)

        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          loss=loss,
                                          train_op=optimizer,
                                          eval_metric_ops=eval_metric_ops)
    def _get_predict_single_input_function(self, data):

        train_sentences, train_char_ids, train_ner_tags = None, None, None

        train_sentences, train_char_ids, train_ner_tags = \
            self._make_seq_pair_text(sentence=data,
                                     char_2_id_map=self.CHAR_2_ID_MAP,
                                     use_char_embd=self._use_char_embd)

        # print_error(train_char_ids)
        # print_info(train_ner_tags)
        if self._use_char_embd:
            dataset = tf.data.Dataset.from_tensor_slices(({self.FEATURE_1_NAME: train_sentences,
                                                           self.FEATURE_2_NAME: train_char_ids}, np.zeros(1)))
        else:
            dataset = tf.data.Dataset.from_tensor_slices(({self.FEATURE_1_NAME: train_sentences}, np.zeros(1)))
        dataset = dataset.batch(batch_size=self._batch_size)
        print_info("Dataset output sizes are: ")
        print_info(dataset.output_shapes)
        return dataset
Example #19
    def _get_train_input_fn(self):
        """
        Inheriting class must implement this
        :return: dataset
        """
        # TF dataset APIs
        dataset = tf.data.TFRecordDataset(
            glob.glob(
                os.path.join(self._dataset.TRAIN_OUT_PATH,
                             "tfrecords/*.tfrecord")),
            num_parallel_reads=self._hparams.num_threads)
        # Map the generator output as features as a dict and labels
        dataset = dataset.map(self.decode)

        dataset = dataset.batch(batch_size=self._hparams.batch_size,
                                drop_remainder=True)
        dataset = dataset.prefetch(self._hparams.prefetch_size)
        # dataset = dataset.cache(filename=os.path.join(self.iterator_dir, "train_data_cache"))
        print_info("Dataset output sizes are: ")
        print_info(dataset.output_shapes)

        return dataset
Example #20
    def convert_pdf(self, pdf_path):
        """
        Reference: https://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/
        :param pdf_path:
        :return:
        """

        tool = pyocr.get_available_tools()[0]
        lang = tool.get_available_languages()[0]

        print_info(tool.get_available_languages())
        pdf_path = os.path.normpath(pdf_path)
        file_name = pdf_path.split(os.sep)[-1].split(".")[0]

        # with Image(filename=pdf_path, resolution=300) as img:
        #     img.compression_quality = 99
        #     img.save(filename=os.path.join(self._image_dir,file_name))

        req_image = []
        final_text = []
        text_file_path = ""

        image_pdf = Image(filename=pdf_path, resolution=300)
        image_jpeg = image_pdf.convert('jpeg')
        for img in image_jpeg.sequence:
            img_page = Image(image=img)
            req_image.append(img_page.make_blob('jpeg'))

        for i, img in tqdm(enumerate(req_image)):
            text = tool.image_to_string(PI.open(io.BytesIO(img)),
                                        lang=lang,
                                        builder=pyocr.builders.TextBuilder())
            text_file_path = os.path.join(self._text_out_dir,
                                          file_name + str(i) + ".txt")
            with open(text_file_path, "w") as fd:
                fd.write("%s" % text)
        return text_file_path
 def _create_target_directories(self):
     """
     To setup destination folders structure if not present.
     :return:
     """
     if os.path.exists(self.PREPROCESSED_DATA_OUT_DIR):
         if self._over_write:
             print_info("Deleting data folder: {}".format(
                 self.PREPROCESSED_DATA_OUT_DIR))
             shutil.rmtree(self.PREPROCESSED_DATA_OUT_DIR)
             print_info("Recreating data folder: {}".format(
                 self.PREPROCESSED_DATA_OUT_DIR))
             os.makedirs(self.PREPROCESSED_DATA_OUT_DIR)
         else:
             print_info(
                 "Skipping preprocessing step, since the data might already be available"
             )
     else:
         print_info("Creating data folder: {}".format(
             self.PREPROCESSED_DATA_OUT_DIR))
         os.makedirs(self.PREPROCESSED_DATA_OUT_DIR)
 def copy(self, in_path, out_dir):
     path, file_name = os.path.split(in_path)
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
         print_info("Copying the file {} to {}".format(in_path, out_dir))
         shutil.copy(src=in_path, dst=out_dir)
     else:
         if not os.path.exists(os.path.join(out_dir, file_name)):
             print_info("Copying the file {} to {}".format(
                 in_path, out_dir))
             shutil.copy(src=in_path, dst=out_dir)
         else:
             print_info("Found previous copy @ {}".format(
                 os.path.join(out_dir, file_name)))
Example #23
    def preprocess_prepare(self):
        if not os.path.exists(os.path.join(self.TRAIN_OUT_PATH, "clips")):
            self._initialize_spark()
            self._extract_clips(os.path.join(self.TRAIN_IN_PATH, "sph"),
                                os.path.join(self.TRAIN_OUT_PATH, "clips"))
        if not os.path.exists(os.path.join(self.VAL_OUT_PATH, "clips")):
            self._initialize_spark()
            self._extract_clips(os.path.join(self.VAL_IN_PATH, "sph"),
                                os.path.join(self.VAL_OUT_PATH, "clips"))
        if not os.path.exists(os.path.join(self.TEST_OUT_PATH, "clips")):
            self._initialize_spark()
            self._extract_clips(os.path.join(self.TEST_IN_PATH, "sph"),
                                os.path.join(self.TEST_OUT_PATH, "clips"))

        self._prepare_wav_pairs()

        if not os.path.exists(os.path.join(self.TRAIN_OUT_PATH, "tfrecords")):
            self._initialize_spark()
            print_info("Processing {} wav pairs, have a break...".format(
                len(self.TRAIN_WAV_PAIR)))
            self._generate_mix_speeches(
                self.TRAIN_WAV_PAIR,
                os.path.join(self.TRAIN_OUT_PATH, "tfrecords"))
        if not os.path.exists(os.path.join(self.VAL_OUT_PATH, "tfrecords")):
            self._initialize_spark()
            print_info("Processing {} wav pairs, have a break...".format(
                len(self.VAL_WAV_PAIR)))
            self._generate_mix_speeches(
                self.VAL_WAV_PAIR, os.path.join(self.VAL_OUT_PATH,
                                                "tfrecords"))
        if not os.path.exists(os.path.join(self.TEST_OUT_PATH, "tfrecords")):
            self._initialize_spark()
            print_info("Processing {} wav pairs, have a break...".format(
                len(self.TEST_WAV_PAIR)))
            self._generate_mix_speeches(
                self.TEST_WAV_PAIR,
                os.path.join(self.TEST_OUT_PATH, "tfrecords"))
    def _extract_vocab(self):
        """
        Uses the preprocessed data from the configured location and extracts
        the word and character level vocab.
        :return:
        """

        if not os.path.exists(self.WORDS_VOCAB_FILE) \
                or not os.path.exists(self.ENTITY_VOCAB_FILE) \
                or not os.path.exists(self.CHARS_VOCAB_FILE):
            print_info("Preparing the vocab for the text col: {}".format(self._text_col))

            lines = set()
            entities = set()

            for df_file in tqdm(os.listdir(self.TRAIN_FILES_IN_PATH), desc="merging lines"):
                df_file = os.path.join(self.TRAIN_FILES_IN_PATH, df_file)

                if df_file.endswith(".csv"):
                    df = pd.read_csv(df_file, sep=self._in_seperator, quoting=csv.QUOTE_NONE)  # .fillna(SpecialTokens.UNK_WORD)
                    df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
                else:
                    raise RuntimeError
                # print(df["0,1,2,3"])

                lines.update(set(df[self._text_col].values.tolist()))
                entities.update(set(df[self._entity_col].values.tolist()))

            self.WORD_VOCAB_SIZE, words_vocab = naive_vocab_creater(lines=lines,
                                                                    out_file_name=self.WORDS_VOCAB_FILE,
                                                                    use_nlp=True)

            print_info("Preparing the character vocab for the text col: {}".format(self._text_col))

            # Get char level vocab
            char_vocab = [SpecialTokens.PAD_CHAR, SpecialTokens.UNK_CHAR]
            _vocab = get_char_vocab(words_vocab)
            char_vocab.extend(_vocab)

            # Create char2id map
            self.CHAR_2_ID_MAP = vocab_to_tsv(vocab_list=char_vocab,
                                              out_file_name=self.CHARS_VOCAB_FILE)

            self.CHAR_VOCAB_SIZE = len(self.CHAR_2_ID_MAP)

            print_info("Preparing the vocab for the entity col: {}".format(self._entity_col))

            # NUM_TAGS, tags_vocab = tf_vocab_processor(lines, ENTITY_VOCAB_FILE)
            self.NUM_TAGS, tags_vocab = naive_vocab_creater(lines=entities,
                                                            out_file_name=self.ENTITY_VOCAB_FILE,
                                                            use_nlp=False)
        else:
            print_info("Reusing the vocab")
            self.WORD_VOCAB_SIZE, words_vocab = naive_vocab_creater(lines=None,
                                                                    out_file_name=self.WORDS_VOCAB_FILE,
                                                                    use_nlp=None)
            self.CHAR_2_ID_MAP = vocab_to_tsv(out_file_name=self.CHARS_VOCAB_FILE,
                                              vocab_list=None)
            self.CHAR_VOCAB_SIZE = len(self.CHAR_2_ID_MAP)

            self.NUM_TAGS, tags_vocab = naive_vocab_creater(lines=None,
                                                            out_file_name=self.ENTITY_VOCAB_FILE,
                                                            use_nlp=False)

        self.TAGS_2_ID = {id_num: tag for id_num, tag in enumerate(tags_vocab)}
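
A self-contained illustration of the character-level vocab idea used above; get_char_vocab and the special PAD/UNK tokens are project helpers, so plain Python stand-ins are used here.

words = ["John", "lives", "in", "New", "York"]
char_vocab = ["<PAD_CHAR>", "<UNK_CHAR>"] + sorted({ch for word in words for ch in word})
char_2_id_map = {ch: idx for idx, ch in enumerate(char_vocab)}
print(len(char_2_id_map), char_2_id_map["J"])
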
    def generate_features(self, wav_file_1, wav_file_2):

        try:
            start = time.time()
            speech_1, _ = librosa.core.load(wav_file_1,
                                            sr=self._hparams.sampling_rate)
            # amp factor between -3 dB - 3 dB
            fac = np.random.rand(1)[0] * 6 - 3
            speech_1 = 10.**(fac / 20) * speech_1

            speech_2, _ = librosa.core.load(wav_file_2,
                                            sr=self._hparams.sampling_rate)
            fac = np.random.rand(1)[0] * 6 - 3
            speech_2 = 10.**(fac / 20) * speech_2

            # mix
            length = min(len(speech_1), len(speech_2))
            speech_1 = speech_1[:length]
            speech_2 = speech_2[:length]
            speech_mix = speech_1 + speech_2

            # compute log spectrum for 1st speaker
            speech_1_features = np.abs(
                stft(speech_1,
                     self._hparams.frame_size)[:, :self._hparams.neff])
            speech_1_features = np.maximum(
                speech_1_features,
                np.max(speech_1_features) / self._hparams.min_amp)
            speech_1_features = 20. * np.log10(
                speech_1_features * self._hparams.amp_fac)

            # same for the 2nd speaker
            speech_2_features = np.abs(
                stft(speech_2,
                     self._hparams.frame_size)[:, :self._hparams.neff])
            speech_2_features = np.maximum(
                speech_2_features,
                np.max(speech_2_features) / self._hparams.min_amp)
            speech_2_features = 20. * np.log10(
                speech_2_features * self._hparams.amp_fac)

            # same for the mixture
            speech_mix_spec0 = stft(
                speech_mix, self._hparams.frame_size)[:, :self._hparams.neff]
            speech_mix_features = np.abs(speech_mix_spec0)

            # speech_phase = speech_mix_spec0 / speech_mix_spec
            speech_mix_features = np.maximum(
                speech_mix_features,
                np.max(speech_mix_features) / self._hparams.min_amp)
            speech_mix_features = 20. * np.log10(
                speech_mix_features * self._hparams.amp_fac)
            max_mag = np.max(speech_mix_features)

            # if np.isnan(max_mag):
            # import ipdb; ipdb.set_trace()
            speech_VAD = (speech_mix_features >
                          (max_mag - self._hparams.threshold)).astype(int)

            speech_mix_features = (
                speech_mix_features -
                self._hparams.global_mean) / self._hparams.global_std

            # The ideal binary mask gives ownership of a time-frequency bin to the source whose
            # magnitude is maximum among all sources in that bin (see the numpy illustration
            # after this function).
            # The mask values are 1 for active and 0 otherwise (binary),
            # making Y x Y^T the ideal affinity matrix for the mixture.
            Y = np.array([
                speech_1_features > speech_2_features,
                speech_1_features < speech_2_features
            ]).astype('bool')
            Y = np.transpose(Y, [1, 2, 0]).astype('bool')

            # speech_mix_features = speech_mix_features[0:self._hparams.dummy_slicing_dim, :]
            # speech_VAD = speech_VAD[0:self._hparams.dummy_slicing_dim, :]
            # Y = Y[0:self._hparams.dummy_slicing_dim, :, :]

            # print_info("{} vs {}".format(wav_file_1, wav_file_2))
            end = time.time()

            print_info("Thread name: {} : took {}".format(
                threading.currentThread().getName(), end - start))

            if speech_mix_features.shape[0] != 1247 or speech_VAD.shape[
                    0] != 1247 or Y.shape[0] != 1247:
                raise Exception("Found files with improper duration/data")

            return speech_mix_features.astype('float32'), speech_VAD.astype(
                'bool'), Y.astype('bool')
        except Exception as e:
            print_warn(e)
            print_error("{} vs {}".format(wav_file_1, wav_file_2))
            return np.random.random((self._hparams.dummy_slicing_dim,129)).astype('float32'), \
                   np.empty((self._hparams.dummy_slicing_dim,129), dtype="bool"), \
                   np.empty((self._hparams.dummy_slicing_dim,129, 2), dtype="bool")
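
A small numpy illustration of the ideal-binary-mask construction described in the comments above: each time-frequency bin is assigned to whichever source has the larger magnitude (shapes here are tiny and illustrative).

import numpy as np

speech_1 = np.array([[3.0, 1.0], [0.5, 2.0]])  # |STFT| of speaker 1 (time x freq)
speech_2 = np.array([[1.0, 2.0], [4.0, 0.1]])  # |STFT| of speaker 2
Y = np.stack([speech_1 > speech_2, speech_1 < speech_2], axis=-1)  # (time, freq, 2) bool
print(Y[..., 0])  # bins owned by speaker 1
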
Example #26
    def discriminator(self, x, out_channel_dim, is_training=True, reuse=False):
        # It must be an auto-encoder style architecture
        # Architecture : (64)4c2s-FC32_BR-FC64*14*14_BR-(1)4dc2s_S
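        # (Reading the shorthand, roughly: "(64)4c2s" = 4x4 conv, 64 filters, stride 2;
        #  "FC32_BR" = fully connected to 32 units + batch norm + ReLU;
        #  "FC64*14*14_BR" = fully connected to 64*14*14 units + batch norm + ReLU;
        #  "(1)4dc2s_S" = 4x4 transposed conv back to 1 channel, stride 2, sigmoid output.)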
        with tf.variable_scope("namespace_discriminator", reuse=reuse):
            # net = tf.nn.relu(conv2d(x, 64, 4, 4, 2, 2, name='d_conv1'))
            net = tf.layers.conv2d(
                x,
                64,
                4,
                strides=2,
                padding='same',
                kernel_initializer=tf.random_normal_initializer(stddev=0.02),
                name='d_conv1')
            net = tf.nn.relu(net)

            tf.logging.info("======> net: {}".format(net))
            print_error("net1: {} ".format(net))

            size = (self.image_size // 2)

            net = tf.reshape(
                net, [self._data_iterator.batch_size, size * size * 64])

            # code = tf.nn.relu(bn(linear(net, 32, scope='d_fc6'), is_training=is_training, scope='d_bn6'))
            code = tf.contrib.layers.fully_connected(inputs=net,
                                                     num_outputs=32,
                                                     scope="d_fc6")
            code = tf.contrib.layers.batch_norm(code,
                                                decay=0.9,
                                                updates_collections=None,
                                                epsilon=1e-5,
                                                scale=True,
                                                is_training=is_training,
                                                scope='d_bn6')
            code = tf.nn.relu(code)

            print_error("code: {} ".format(code))
            # net = tf.nn.relu(bn(linear(code, 64 * 14 * 14, scope='d_fc3'), is_training=is_training, scope='d_bn3'))
            size = (self.image_size // 2)
            net = tf.contrib.layers.fully_connected(inputs=code,
                                                    num_outputs=64 * size *
                                                    size,
                                                    scope="d_fc3")

            net = tf.contrib.layers.batch_norm(net,
                                               decay=0.9,
                                               updates_collections=None,
                                               epsilon=1e-5,
                                               scale=True,
                                               is_training=is_training,
                                               scope='d_bn3')
            print_error("net: {} ".format(net))
            print_error(net)

            size = (self.image_size // 2)
            net = tf.reshape(net,
                             [self._data_iterator.batch_size, size, size, 64])
            print_error(net)

            # out = tf.nn.sigmoid(deconv2d(net, [self.gan_config.batch_size, 28, 28, 1], 4, 4, 2, 2, name='d_dc5'))
            net = tf.layers.conv2d_transpose(net,
                                             out_channel_dim,
                                             4,
                                             strides=2,
                                             padding='same',
                                             name='d_dc5')
            out = tf.nn.sigmoid(net)

            print_info("==================================")
            print_info(out)
            print_info(x)
            # recon loss
            recon_error = tf.sqrt(
                2 * tf.nn.l2_loss(out - x)) / self._data_iterator.batch_size
            print_info("==================================")
            print_error(recon_error)

            return out, recon_error, code
 def store_as_pickle(self, data, file_name):
     file_path = os.path.join(self.dataset_dir, file_name)
     print_info("Writing the pickle file {}...".format(file_path))
     with open(file_path, 'wb') as f:
         pickle.dump(data, f)
     return None
 def store_padded_data(self, file_name, data):
     file_path = os.path.join(self.EXPERIMENT_ROOT_DIR, file_name)
     print_info("Writing the padded data...")
     with open(file_path, 'wb') as f:
         pickle.dump(data, f)
     return None
Example #29
    def visulaize(self, executor, file_path):
        """

        :param executor:
        :param test_file_path:
        :return:
        """

        estimator = executor.estimator

        in_data_features, voice_activity_detection_data_features, phase_features = self._get_predict_samples(
            file_path=file_path)

        in_data_features = np.asarray(in_data_features)
        voice_activity_detection_data_features = np.asarray(
            voice_activity_detection_data_features)

        N_frames = in_data_features.shape[0]
        hop_size = self._hparams.frame_size // 4

        def get_dataset():
            dataset = tf.data.Dataset.from_tensor_slices(({
                self.FEATURE_1_NAME:
                in_data_features,
                self.FEATURE_2_NAME:
                voice_activity_detection_data_features
            }, np.ones_like(in_data_features)))
            dataset = dataset.batch(batch_size=1)
            print_info(dataset.output_shapes)
            return dataset

        predict_fn = estimator.predict(input_fn=lambda: get_dataset())

        print_info("Shape of in data: {}".format(in_data_features.shape))
        print_info("Number of frames for given file: {}".format(N_frames))

        embeddings = []
        i = 0

        for predicted_value in predict_fn:
            # print("i = {}".format(i))
            """
            TODO:
            strange behaviour!
            
            1 wav file = N samples
            Eg: N = 600
            FramesPerSample=100, BatchSize = 1, NEFF = 129, EMD_K = 30
            
            For each sample the embeddings is of shape [batch_size * frames_per_sample, NEFF, embd_dim].
            For prediction batch size is made 1.
            Hence the embeddings colapse to [frames_per_sample, NEFF, embd_dim]
            1 sample predictions will have `frames_per_sample` outputs
            Eg: If input audio file has 75 frames, the prediction will have [7500, NEFF, embd_dim]
            """
            embeddings.append(predicted_value)
            i += 1

        print_info("Number of embeddings predicted for given file: {}".format(
            len(embeddings)))
        print_error(np.asarray(embeddings).shape)

        N_assign = 0
        step = 0

        for frame_i in tqdm(range(N_frames)):

            # expand the dimension to be in line with the TF batch size
            in_data_np = np.expand_dims(in_data_features[frame_i], axis=0)
            in_phase_np = np.expand_dims(phase_features[frame_i], axis=0)
            voice_activity_detection_data_np = np.expand_dims(
                voice_activity_detection_data_features[frame_i], axis=0)
            embedding_np = np.asarray(
                embeddings[frame_i:frame_i + self._hparams.frames_per_sample])

            # ----------------------------------------------

            embedding_ac = []
            for i, j in itertools.product(
                    range(self._hparams.frames_per_sample),
                    range(self._hparams.neff)):
                if voice_activity_detection_data_np[0, i, j] == 1:
                    embedding_ac.append(embedding_np[i, j, :])

            kmean = KMeans(n_clusters=2, random_state=0).fit(embedding_ac)
            # visualization using 3 PCA
            pca_Data = PCA(n_components=3).fit_transform(embedding_ac)
            fig = plt.figure(1, figsize=(8, 6))
            ax = Axes3D(fig, elev=-150, azim=110)
            # ax.scatter(pca_Data[:, 0], pca_Data[:, 1], pca_Data[:, 2],
            #            c=kmean.labels_, cmap=plt.cm.Paired)
            ax.scatter(pca_Data[:, 0],
                       pca_Data[:, 1],
                       pca_Data[:, 2],
                       cmap=plt.cm.Paired)
            ax.set_title('Embedding visualization using the first 3 PCs')
            ax.set_xlabel('1st pc')
            ax.set_ylabel('2nd pc')
            ax.set_zlabel('3rd pc')
            if not os.path.exists("vis"):
                os.makedirs("vis")
            plt.savefig('vis/' + str(step) + 'pca.jpg')

            step += 1
Example #30
 def convert(self, path):
     print_info(path)
     if path.endswith("pdf"):
         return self.convert_pdf(pdf_path=path)
     else:
         return self.convert_image(image_path=path)