def _get_test_input_function(self): """ Inheriting class must implement this :return: callable """ dataset = tf.data.Dataset.from_generator( self._yield_test_samples, (tf.float32, tf.bool, tf.bool), output_shapes=(TensorShape([ Dimension(self._hparams.frames_per_sample), Dimension(self._hparams.neff) ]), TensorShape([ Dimension(self._hparams.frames_per_sample), Dimension(self._hparams.neff) ]), TensorShape([ Dimension(self._hparams.frames_per_sample), Dimension(self._hparams.neff), Dimension(2) ]))) dataset = dataset.map( self.feature_map_func, num_parallel_calls=self._hparams.num_parallel_calls) dataset = dataset.batch(batch_size=self._hparams.batch_size, drop_remainder=True) dataset = dataset.prefetch(self._hparams.prefetch_size) dataset = dataset.cache( filename=os.path.join(self.iterator_dir, "test_data_cache")) print_info("Dataset output sizes are: ") print_info(dataset.output_shapes) return dataset
def before_run(self, run_context):
    if self._path is None:
        self._path = os.path.join(os.path.expanduser("~"),
                                  "vitaFlow/runtime/GAN")
    global_step = run_context.session.run(self._global_Step)
    print_info("global_step {}".format(global_step))

    if global_step % self._store_interval_steps == 0:  # store a sample grid every n steps
        samples = run_context.session.run(self._z_image)
        channel = self._z_image.get_shape()[-1]
        if channel == 1:
            images_grid = images_square_grid(samples, "L")
        else:
            images_grid = images_square_grid(samples, "RGB")
        if not os.path.exists(self._path):
            os.makedirs(self._path)
        images_grid.save(
            os.path.join(self._path, 'step_{}.png'.format(global_step)))

    if global_step % self._log_interval_steps == 0:
        dloss, gloss = run_context.session.run(
            [self._d_loss, self._g_loss])
        print_info(
            "\nDiscriminator Loss: {:.4f}... Generator Loss: {:.4f}".format(
                dloss, gloss))
def _get_val_input_fn(self): """ Inheriting class must implement this :return: callable """ dataset = tf.data.Dataset.from_tensor_slices( (list(self.VAL_WAV_PAIR.keys()), list(self.VAL_WAV_PAIR.values()))) dataset = dataset.map( lambda wav_file_1, wav_file_2: tuple( tf.py_func(self.generate_features, [wav_file_1, wav_file_2], (tf.float32, tf.bool, tf.bool))), num_parallel_calls=self._hparams.num_parallel_calls) dataset = dataset.map( self._user_resize_func, num_parallel_calls=self._hparams.num_parallel_calls) dataset = dataset.batch(batch_size=self._hparams.batch_size, drop_remainder=True) dataset = dataset.prefetch(self._hparams.prefetch_size) dataset = dataset.cache( filename=os.path.join(self.iterator_dir, "val_data_cache")) print_info("Dataset output sizes are: ") print_info(dataset.output_shapes) return dataset
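# Hedged sketch (not from the source; shapes are assumptions): tf.py_func outputs lose
# their static shapes, so a follow-up map such as `self._user_resize_func` above
# typically re-asserts them before batching. A minimal stand-alone version:
import tensorflow as tf

def restore_static_shapes(features, vad, mask, frames_per_sample=100, neff=129):
    # frames_per_sample/neff are illustrative defaults, not the project's hparams
    features.set_shape([frames_per_sample, neff])
    vad.set_shape([frames_per_sample, neff])
    mask.set_shape([frames_per_sample, neff, 2])
    return features, vad, mask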
def _get_test_input_fn(self): file_name = "test_padded_data_" + str(self._use_char_embd) + ".p" train_sentences, train_char_ids, train_ner_tags = None, None, None data = self.get_padded_data(file_name=file_name) if data is None: train_sentences, train_char_ids, train_ner_tags = \ self._make_seq_pair(df_files_path=self.TEST_FILES_IN_PATH, char_2_id_map=self.CHAR_2_ID_MAP, use_char_embd=self._use_char_embd) self.store_padded_data(data=(train_sentences, train_char_ids, train_ner_tags), file_name=file_name) else: train_sentences, train_char_ids, train_ner_tags = data # print_error(train_char_ids) # print_info(train_ner_tags) if self._use_char_embd: dataset = tf.data.Dataset.from_tensor_slices(({self.FEATURE_1_NAME: train_sentences, self.FEATURE_2_NAME: train_char_ids}, train_ner_tags)) else: dataset = tf.data.Dataset.from_tensor_slices(({self.FEATURE_1_NAME: train_sentences}, train_ner_tags)) dataset = dataset.batch(batch_size=self._batch_size) print_info("Dataset output sizes are: ") print_info(dataset.output_shapes) return dataset
def _get_speaker_files(self, data_dir):  # TODO S3 support
    """
    :param data_dir: dir containing the training data (root_dir + speaker_dir + wav_files)
    :returns: speaker_wav_files (dict) : {speaker : [files]}
    """
    # get dirs for each speaker
    speakers_dirs = [os.path.join(data_dir, speaker) for speaker in os.listdir(data_dir)
                     if os.path.isdir(os.path.join(data_dir, speaker))]
    print_info(speakers_dirs)
    speaker_wav_files_dict = {}
    # get the files in each speaker's dir
    # TODO: Convert below to a dict comprehension / collections.defaultdict (see the sketch after this function)
    for speaker_dir in speakers_dirs:
        speaker = speaker_dir.split("/")[-1]
        wav_files = [
            os.path.join(speaker_dir, file) for file in os.listdir(speaker_dir)
            if file.endswith("wav")
        ]
        for wav_file in wav_files:
            if speaker not in speaker_wav_files_dict:
                speaker_wav_files_dict[speaker] = []
            speaker_wav_files_dict[speaker].append(wav_file)

    if len(speaker_wav_files_dict) == 0:
        raise RuntimeError(
            "shabda: No wav files found under directory {}".format(data_dir))
    return speaker_wav_files_dict
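# A minimal sketch of the defaultdict rewrite suggested by the TODO above (an
# assumption about intent, not part of the source): same {speaker: [wav files]}
# result with fewer branches.
import os
from collections import defaultdict

def get_speaker_wav_files(data_dir):
    speaker_wav_files = defaultdict(list)
    for speaker in os.listdir(data_dir):
        speaker_dir = os.path.join(data_dir, speaker)
        if not os.path.isdir(speaker_dir):
            continue
        for file_name in os.listdir(speaker_dir):
            if file_name.endswith("wav"):
                speaker_wav_files[speaker].append(os.path.join(speaker_dir, file_name))
    if not speaker_wav_files:
        raise RuntimeError("shabda: No wav files found under directory {}".format(data_dir))
    return dict(speaker_wav_files)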
def __init__(self, experiment_name, name="NaiveConvNet", model_root_directory=os.path.expanduser("~") + "/vitaFlow/", out_dim=-1, learning_rate=0.001, keep_probability=0.5, data_iterator=None): ClassifierBase.__init__(self, experiment_name=experiment_name, model_root_directory=model_root_directory, name=name, out_dim=out_dim, learning_rate=learning_rate) ImageFeature.__init__(self) # self._hparams = HParams(hparams, self.default_hparams()) self._data_iterator = data_iterator self._keep_prob = keep_probability self._conv_num_outputs = 32 # TODO self._conv_ksize = (5, 5) self._conv_strides = (1, 1) self._pool_ksize = (2, 2) self._pool_strides = (2, 2) self._num_outputs = 10 #number of classes # TODO print_info("NaiveConvNet initialized")
def _get_test_input_function(self): """ Inheriting class must implement this :return: callable """ dataset = tf.data.Dataset.from_generator( self._yield_test_samples, (tf.float32, tf.bool, tf.bool), output_shapes=(TensorShape([ Dimension(self._hparams.frames_per_sample), Dimension(self._hparams.neff) ]), TensorShape([ Dimension(self._hparams.frames_per_sample), Dimension(self._hparams.neff) ]), TensorShape([ Dimension(self._hparams.frames_per_sample), Dimension(self._hparams.neff), Dimension(2) ]))) # Map the generator output as features as a dict and labels dataset = dataset.map(lambda x, y, z: ({ self.FEATURE_1_NAME: x, self.FEATURE_2_NAME: y }, z)) dataset = dataset.batch(batch_size=self._hparams.batch_size, drop_remainder=True) dataset = dataset.prefetch(self._hparams.prefetch_size) # dataset = dataset.cache(filename=os.path.join(self.iterator_dir, "test_data_cache")) print_info("Dataset output sizes are: ") print_info(dataset.output_shapes) return dataset
def generator(self, z, out_channel_dim, is_train=True): """ Create the _generator network :param z: Input z on dimension Z :param out_channel_dim: The number of channels in the output image :param is_train: Boolean if _generator is being used for training :return: The tensor output of the _generator """ with tf.variable_scope('_generator', reuse=False): gen_filter_size = self.gen_filter_size # x = tf.layers.batch_normalization(z) # First fully connected layer x = tf.layers.dense(z, 8 * 8 * gen_filter_size) # Reshape it to start the convolutional stack x = tf.reshape(x, (-1, 8, 8, gen_filter_size)) # x = tf.layers.batch_normalization(x, training=is_train) x = tf.maximum(self.alpha * x, x) x = tf.layers.conv2d_transpose(x, gen_filter_size // 2, 5, strides=1, padding='same') x = tf.maximum(self.alpha * x, x) x = tf.layers.batch_normalization(x, training=is_train) gen_filter_size = gen_filter_size // 4 # 32 // 8 = srt(4) => 2 => (8) -> 16 -> 32 # 64 // 8 = srt(8) => 3 => (8) -> 16 -> 32 -> 64 # 128 // 8 = srt(16) => 4 => (8) -> 16 -> 32 -> 64 -> 128 # Based on image size adds Conv layer with appropriate filter size for i in range(int(math.sqrt(self.image_size // 8))): gen_filter_size = gen_filter_size // 2 x = tf.layers.conv2d_transpose(x, gen_filter_size, 5, strides=2, padding='same') x = tf.maximum(self.alpha * x, x) x = tf.layers.batch_normalization(x, training=is_train) print_info("======>x at conv layer {} is {}".format(i, x)) # Output layer logits = tf.layers.conv2d_transpose(x, out_channel_dim, 5, strides=1, padding='same') # HxWxNUM_CHANNELS now out = tf.tanh(logits) print_info("======>out: {}".format(out)) return out
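# Worked check (illustrative only) of the filter-size/layer-count comments above:
# the loop adds int(math.sqrt(image_size // 8)) transpose-conv layers, each doubling
# the spatial size of the initial 8x8 feature map. Note that for image_size=64,
# int(math.sqrt(8)) is 2 (not the 3 the commented pattern suggests), so the output
# stops at 32x32; int(math.log2(image_size // 8)) would match the commented pattern.
import math

for image_size in (32, 64, 128):
    num_up_layers = int(math.sqrt(image_size // 8))
    print(image_size, num_up_layers, 8 * (2 ** num_up_layers))
# 32 -> 2 layers -> 32, 64 -> 2 layers -> 32, 128 -> 4 layers -> 128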
def generator(self, z, out_channel_dim, is_training=True, reuse=False): """ Create the namespace_generator network :param z: Input z :param out_channel_dim: The number of channels in the output image :param is_training: Boolean if namespace_generator is being used for training :return: The tensor output of the namespace_generator """ with tf.variable_scope( 'namespace_generator', reuse=not is_training): #reuse if it not training phase filter_size = 512 # First fully connected layer x = tf.layers.dense(z, 8 * 8 * filter_size) # Reshape it to start the convolutional stack x = tf.reshape(x, (-1, 8, 8, filter_size)) x = tf.maximum(self.alpha * x, x) x = tf.layers.conv2d_transpose(x, filter_size // 2, 5, strides=1, padding='same') x = tf.layers.batch_normalization(x, training=is_training) x = tf.maximum(self.alpha * x, x) filter_size = filter_size // 4 # 32 // 8 = srt(4) => 2 => (8) -> 16 -> 32 # 64 // 8 = srt(8) => 3 => (8) -> 16 -> 32 -> 64 # 128 // 8 = srt(16) => 4 => (8) -> 16 -> 32 -> 64 -> 128 for i in range(int(math.sqrt(self.image_size // 8))): filter_size = filter_size // 2 x = tf.layers.conv2d_transpose(x, filter_size, 5, strides=2, padding='same') x = tf.layers.batch_normalization(x, training=is_training) x = tf.maximum(self.alpha * x, x) print_info("======>out: {}".format(x)) # Output layer logits = tf.layers.conv2d_transpose(x, out_channel_dim, 5, strides=1, padding='same') # 28x28x3 now # print(logits)3 out = tf.tanh(logits) print_info("======>out: {}".format(out)) return out
def read_pickle(self, file_name):
    file_path = os.path.join(self.dataset_dir, file_name)
    if os.path.exists(file_path):
        print_info("Reading the pickle file {}...".format(file_path))
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
        return data
    else:
        return None
def get_padded_data(self, file_name):
    file_path = os.path.join(self.EXPERIMENT_ROOT_DIR, file_name)
    if os.path.exists(file_path):
        print_info("Reading the padded data...")
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
        return data
    else:
        return None
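# A minimal sketch (names assumed, not from the source) of the compute-or-load
# caching pattern that get_padded_data/store_padded_data support: try the pickle
# cache first, rebuild and persist only on a miss.
def load_or_build(reader, builder, writer, file_name):
    data = reader(file_name)                    # e.g. self.get_padded_data
    if data is None:
        data = builder()                        # expensive preprocessing step
        writer(data=data, file_name=file_name)  # e.g. self.store_padded_data
    return data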
def get_dataset():
    dataset = tf.data.Dataset.from_tensor_slices((
        {self.FEATURE_1_NAME: in_data_features,
         self.FEATURE_2_NAME: voice_activity_detection_data_features},
        np.ones_like(in_data_features)))
    dataset = dataset.batch(batch_size=1)
    print_info(dataset.output_shapes)
    return dataset
def _discriminator(self, images, reuse=False): """ Create the _discriminator network :param image: Tensor of input image(s) :param reuse: Boolean if the weights should be reused :return: Tuple of (tensor output of the _discriminator, tensor logits of the _discriminator) """ with tf.variable_scope('_discriminator', reuse=reuse): # Input layer consider ?x32x32x3 x1 = tf.layers.conv2d( images, 64, 5, strides=2, padding='same', kernel_initializer=tf.random_normal_initializer(stddev=0.02)) relu1 = tf.maximum(0.02 * x1, x1) relu1 = tf.layers.dropout(relu1, rate=0.5) # 16x16x64 x2 = tf.layers.conv2d( relu1, 128, 5, strides=2, padding='same', kernel_initializer=tf.random_normal_initializer(stddev=0.02)) bn2 = tf.layers.batch_normalization(x2, training=True) relu2 = tf.maximum(0.02 * bn2, bn2) relu2 = tf.layers.dropout(relu2, rate=0.5) # 8x8x128 x3 = tf.layers.conv2d( relu2, 256, 5, strides=2, padding='same', kernel_initializer=tf.random_normal_initializer(stddev=0.02)) bn3 = tf.layers.batch_normalization(x3, training=True) relu3 = tf.maximum(0.02 * bn3, bn3) relu3 = tf.layers.dropout(relu3, rate=0.5) # 4x4x256 # Flatten it flat = tf.reshape(relu3, (-1, 4 * 4 * 256)) logits = tf.layers.dense(flat, 1) # print(logits) out = tf.sigmoid(logits) # print('_discriminator out: ', out) print_info("======> _discriminator out: {}".format(out)) return out, logits
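# Side note (illustrative, not from the source): the tf.maximum(0.02 * x, x) pattern
# used throughout the discriminator above is a leaky ReLU with slope 0.02; a small
# self-contained equivalent for reference.
import tensorflow as tf

def leaky_relu(x, alpha=0.02):
    # identity for x >= 0, alpha * x for x < 0
    return tf.maximum(alpha * x, x)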
def image_annotations(path_to_tensorflow_model, category_index, images_src, images_dest): def get_box_dims(box, image_shape): ymin, xmin, ymax, xmax = box im_width, im_height, im_depth = image_shape ymin, xmin, ymax, xmax = map(int, (xmin * im_width, xmax * im_width, ymin * im_height, ymax * im_height)) return (ymax, xmax, ymin, xmin) detection_graph = tf.Graph() with detection_graph.as_default(): od_graph_def = tf.GraphDef() with tf.gfile.GFile(path_to_tensorflow_model, 'rb') as fid: serialized_graph = fid.read() od_graph_def.ParseFromString(serialized_graph) tf.import_graph_def(od_graph_def, name='') sess = tf.Session(graph=detection_graph) image_tensor = detection_graph.get_tensor_by_name('image_tensor:0') detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0') detection_scores = detection_graph.get_tensor_by_name('detection_scores:0') detection_classes = detection_graph.get_tensor_by_name('detection_classes:0') num_detections = detection_graph.get_tensor_by_name('num_detections:0') bag = [] for image_path in tqdm(glob(images_src+"/*")): print_info("Processing {}".format(image_path)) image = plt.imread(image_path) image_expanded = np.expand_dims(image, axis=0) # Perform the actual detection by running the model with the image as input (boxes, scores, classes, num) = sess.run( [detection_boxes, detection_scores, detection_classes, num_detections], feed_dict={image_tensor: image_expanded}) mask = scores > 0.3 image_shape = image.shape coords = list(map(lambda x: get_box_dims(x, image_shape), boxes[mask].tolist())) tags = list(map(lambda x: category_index[int(x)]['name'], classes[mask].tolist())) scores = scores[mask].tolist() bag.append({'image_loc': image_path, 'dest': images_dest, 'coords': coords, 'tags': tags, 'scores': scores}) # pprint(bag) return bag
def parallel_convert(self):
    print_info("Running OCR : {}".format(self._image_dir))
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        image_list = glob.glob(self._image_dir + os.sep + "*/*.jpg")
        image_list.extend(glob.glob(self._image_dir + os.sep + "*/*.jpeg"))
        image_list.extend(glob.glob(self._image_dir + os.sep + "*/*.png"))
        # print_info(image_list)
        try:
            for img_path, out_file in zip(
                    image_list, executor.map(self.convert, image_list)):
                print(img_path, ',', out_file, ', processed')
        except Exception as e:
            # don't silently swallow failures from the worker processes
            print_warn(e)
def _get_test_input_function(self):
    """
    Inheriting class must implement this
    :return: callable
    """
    dataset = tf.data.TFRecordDataset(
        glob.glob(
            os.path.join(self._dataset.TEST_OUT_PATH, "tfrecords/*.tfrecord")),
        num_parallel_reads=self._hparams.num_threads)
    # Map each serialized record to a features dict and labels
    dataset = dataset.map(self.decode)
    dataset = dataset.batch(batch_size=self._hparams.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(self._hparams.prefetch_size)

    print_info("Dataset output sizes are: ")
    print_info(dataset.output_shapes)
    return dataset
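# A hedged sketch (feature keys and shapes are assumptions, not from the source) of
# the kind of per-record parser that `self.decode` above refers to, using the TF 1.x
# tf.parse_single_example API the surrounding code targets.
import tensorflow as tf

def decode_example(serialized_example, frames_per_sample=100, neff=129):
    parsed = tf.parse_single_example(
        serialized_example,
        features={
            "speech_mix": tf.FixedLenFeature([frames_per_sample * neff], tf.float32),  # assumed key
            "vad": tf.FixedLenFeature([frames_per_sample * neff], tf.int64),           # assumed key
            "mask": tf.FixedLenFeature([frames_per_sample * neff * 2], tf.int64),      # assumed key
        })
    speech_mix = tf.reshape(parsed["speech_mix"], [frames_per_sample, neff])
    vad = tf.cast(tf.reshape(parsed["vad"], [frames_per_sample, neff]), tf.bool)
    mask = tf.cast(tf.reshape(parsed["mask"], [frames_per_sample, neff, 2]), tf.bool)
    return {"speech_mix": speech_mix, "vad": vad}, mask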
def _build(self, features, labels, params, mode, config=None): images = features[self.FEATURE_NAME] shape = images.get_shape() assert (len(shape) == 4) batch = shape[0].value rows = shape[1].value cols = shape[2].value channel = shape[3].value print_info("{} {} {}".format(batch, rows, cols)) # Loss, training and eval operations are not needed during inference. loss = None optimizer = None eval_metric_ops = {} logits = self._build_layers(features=images, mode=mode) predicted_class = self._get_predicted_classes(logits=logits) predicted_probabilities = self._get_class_probabilities(logits=logits) # top_k = self._get_top_k_predictions(logits=logits) predictions = { "classes": predicted_class, "probabilities": predicted_probabilities, "logits": logits } if mode != tf.estimator.ModeKeys.PREDICT: # labels = tf.reshape(labels, shape=(-1, self._out_dim), name="labels") tf.logging.info('labels: -----> {}'.format(labels)) loss = self._get_loss(labels=labels, logits=logits) optimizer = self._get_optimizer(loss) eval_metric_ops = self._get_eval_metrics(logits=logits, labels=labels) return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, loss=loss, train_op=optimizer, eval_metric_ops=eval_metric_ops)
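# A minimal usage sketch (assumption, not from the source): a _build method with the
# signature above has the usual shape of a tf.estimator model_fn and can be wired in
# roughly like this; the model/input_fn names below are hypothetical placeholders.
import tensorflow as tf

def make_estimator(model, model_dir):
    def model_fn(features, labels, mode, params, config):
        return model._build(features=features, labels=labels,
                            params=params, mode=mode, config=config)
    return tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir)

# estimator = make_estimator(NaiveConvNet(...), "/tmp/naive_convnet")
# estimator.train(input_fn=train_input_fn)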
def _get_predict_single_input_function(self, data): train_sentences, train_char_ids, train_ner_tags = None, None, None train_sentences, train_char_ids, train_ner_tags = \ self._make_seq_pair_text(sentence=data, char_2_id_map=self.CHAR_2_ID_MAP, use_char_embd=self._use_char_embd) # print_error(train_char_ids) # print_info(train_ner_tags) if self._use_char_embd: dataset = tf.data.Dataset.from_tensor_slices(({self.FEATURE_1_NAME: train_sentences, self.FEATURE_2_NAME: train_char_ids}, np.zeros(1))) else: dataset = tf.data.Dataset.from_tensor_slices(({self.FEATURE_1_NAME: train_sentences}, np.zeros(1))) dataset = dataset.batch(batch_size=self._batch_size) print_info("Dataset output sizes are: ") print_info(dataset.output_shapes) return dataset
def _get_train_input_fn(self):
    """
    Inheriting class must implement this
    :return: dataset
    """
    # TF dataset APIs
    dataset = tf.data.TFRecordDataset(
        glob.glob(
            os.path.join(self._dataset.TRAIN_OUT_PATH, "tfrecords/*.tfrecord")),
        num_parallel_reads=self._hparams.num_threads)
    # Map each serialized record to a features dict and labels
    dataset = dataset.map(self.decode)
    dataset = dataset.batch(batch_size=self._hparams.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(self._hparams.prefetch_size)
    # dataset = dataset.cache(filename=os.path.join(self.iterator_dir, "train_data_cache"))

    print_info("Dataset output sizes are: ")
    print_info(dataset.output_shapes)
    return dataset
def convert_pdf(self, pdf_path): """ Reference: https://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/ :param pdf_path: :return: """ tool = pyocr.get_available_tools()[0] lang = tool.get_available_languages()[0] print_info(tool.get_available_languages()) pdf_path = os.path.normpath(pdf_path) file_name = pdf_path.split(os.sep)[-1].split(".")[0] # with Image(filename=pdf_path, resolution=300) as img: # img.compression_quality = 99 # img.save(filename=os.path.join(self._image_dir,file_name)) req_image = [] final_text = [] text_file_path = "" image_pdf = Image(filename=pdf_path, resolution=300) image_jpeg = image_pdf.convert('jpeg') for img in image_jpeg.sequence: img_page = Image(image=img) req_image.append(img_page.make_blob('jpeg')) for i, img in tqdm(enumerate(req_image)): text = tool.image_to_string(PI.open(io.BytesIO(img)), lang=lang, builder=pyocr.builders.TextBuilder()) text_file_path = os.path.join(self._text_out_dir, file_name + str(i) + ".txt") with open(text_file_path, "w") as fd: fd.write("%s" % text) return text_file_path
def _create_target_directories(self): """ To setup destination folders structure if not present. :return: """ if os.path.exists(self.PREPROCESSED_DATA_OUT_DIR): if self._over_write: print_info("Deleting data folder: {}".format( self.PREPROCESSED_DATA_OUT_DIR)) shutil.rmtree(self.PREPROCESSED_DATA_OUT_DIR) print_info("Recreating data folder: {}".format( self.PREPROCESSED_DATA_OUT_DIR)) os.makedirs(self.PREPROCESSED_DATA_OUT_DIR) else: print_info( "Skipping preprocessing step, since the data might already be available" ) else: print_info("Creating data folder: {}".format( self.PREPROCESSED_DATA_OUT_DIR)) os.makedirs(self.PREPROCESSED_DATA_OUT_DIR)
def copy(self, in_path, out_dir):
    path, file_name = os.path.split(in_path)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
        print_info("Copying the file {} to {}".format(in_path, out_dir))
        shutil.copy(src=in_path, dst=out_dir)
    else:
        if not os.path.exists(os.path.join(out_dir, file_name)):
            print_info("Copying the file {} to {}".format(in_path, out_dir))
            shutil.copy(src=in_path, dst=out_dir)
        else:
            print_info("Found previous copy @ {}".format(
                os.path.join(out_dir, file_name)))
def preprocess_prepare(self): if not os.path.exists(os.path.join(self.TRAIN_OUT_PATH, "clips")): self._initialize_spark() self._extract_clips(os.path.join(self.TRAIN_IN_PATH, "sph"), os.path.join(self.TRAIN_OUT_PATH, "clips")) if not os.path.exists(os.path.join(self.VAL_OUT_PATH, "clips")): self._initialize_spark() self._extract_clips(os.path.join(self.VAL_IN_PATH, "sph"), os.path.join(self.VAL_OUT_PATH, "clips")) if not os.path.exists(os.path.join(self.TEST_OUT_PATH, "clips")): self._initialize_spark() self._extract_clips(os.path.join(self.TEST_IN_PATH, "sph"), os.path.join(self.TEST_OUT_PATH, "clips")) self._prepare_wav_pairs() if not os.path.exists(os.path.join(self.TRAIN_OUT_PATH, "tfrecords")): self._initialize_spark() print_info("Processing {} wav pairs, have a break...".format( len(self.TRAIN_WAV_PAIR))) self._generate_mix_speeches( self.TRAIN_WAV_PAIR, os.path.join(self.TRAIN_OUT_PATH, "tfrecords")) if not os.path.exists(os.path.join(self.VAL_OUT_PATH, "tfrecords")): self._initialize_spark() print_info("Processing {} wav pairs, have a break...".format( len(self.VAL_WAV_PAIR))) self._generate_mix_speeches( self.VAL_WAV_PAIR, os.path.join(self.VAL_OUT_PATH, "tfrecords")) if not os.path.exists(os.path.join(self.TEST_OUT_PATH, "tfrecords")): self._initialize_spark() print_info("Processing {} wav pairs, have a break...".format( len(self.TEST_WAV_PAIR))) self._generate_mix_speeches( self.TEST_WAV_PAIR, os.path.join(self.TEST_OUT_PATH, "tfrecords"))
def _extract_vocab(self): """ Uses the preprocessed data from the configured location and extracts the word and character level vocab. :return: """ if not os.path.exists(self.WORDS_VOCAB_FILE) \ or not os.path.exists(self.ENTITY_VOCAB_FILE) \ or not os.path.exists(self.CHARS_VOCAB_FILE): print_info("Preparing the vocab for the text col: {}".format(self._text_col)) lines = set() entities = set() for df_file in tqdm(os.listdir(self.TRAIN_FILES_IN_PATH), desc="mergining lines"): df_file = os.path.join(self.TRAIN_FILES_IN_PATH, df_file) if df_file.endswith(".csv"): df = pd.read_csv(df_file, sep=self._in_seperator,quoting= csv.QUOTE_NONE )#.fillna(SpecialTokens.UNK_WORD) df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) else: raise RuntimeError # print(df["0,1,2,3"]) lines.update(set(df[self._text_col].values.tolist())) entities.update(set(df[self._entity_col].values.tolist())) self.WORD_VOCAB_SIZE, words_vocab = naive_vocab_creater(lines=lines, out_file_name=self.WORDS_VOCAB_FILE, use_nlp=True) print_info("Preparing the character vocab for the text col: {}".format(self._text_col)) # Get char level vocab char_vocab = [SpecialTokens.PAD_CHAR, SpecialTokens.UNK_CHAR] _vocab = get_char_vocab(words_vocab) char_vocab.extend(_vocab) # Create char2id map self.CHAR_2_ID_MAP = vocab_to_tsv(vocab_list=char_vocab, out_file_name=self.CHARS_VOCAB_FILE) self.CHAR_VOCAB_SIZE = len(self.CHAR_2_ID_MAP) print_info("Preparing the vocab for the entity col: {}".format(self._entity_col)) # NUM_TAGS, tags_vocab = tf_vocab_processor(lines, ENTITY_VOCAB_FILE) self.NUM_TAGS, tags_vocab = naive_vocab_creater(lines=entities, out_file_name=self.ENTITY_VOCAB_FILE, use_nlp=False) else: print_info("Reusing the vocab") self.WORD_VOCAB_SIZE, words_vocab = naive_vocab_creater(lines=None, out_file_name=self.WORDS_VOCAB_FILE, use_nlp=None) self.CHAR_2_ID_MAP = vocab_to_tsv(out_file_name=self.CHARS_VOCAB_FILE, vocab_list=None) self.CHAR_VOCAB_SIZE = len(self.CHAR_2_ID_MAP) self.NUM_TAGS, tags_vocab = naive_vocab_creater(lines=None, out_file_name=self.ENTITY_VOCAB_FILE, use_nlp=False) self.TAGS_2_ID = {id_num: tag for id_num, tag in enumerate(tags_vocab)}
def generate_features(self, wav_file_1, wav_file_2): try: start = time.time() speech_1, _ = librosa.core.load(wav_file_1, sr=self._hparams.sampling_rate) # amp factor between -3 dB - 3 dB fac = np.random.rand(1)[0] * 6 - 3 speech_1 = 10.**(fac / 20) * speech_1 speech_2, _ = librosa.core.load(wav_file_2, sr=self._hparams.sampling_rate) fac = np.random.rand(1)[0] * 6 - 3 speech_2 = 10.**(fac / 20) * speech_2 # mix length = min(len(speech_1), len(speech_2)) speech_1 = speech_1[:length] speech_2 = speech_2[:length] speech_mix = speech_1 + speech_2 # compute log spectrum for 1st speaker speech_1_features = np.abs( stft(speech_1, self._hparams.frame_size)[:, :self._hparams.neff]) speech_1_features = np.maximum( speech_1_features, np.max(speech_1_features) / self._hparams.min_amp) speech_1_features = 20. * np.log10( speech_1_features * self._hparams.amp_fac) # same for the 2nd speaker speech_2_features = np.abs( stft(speech_2, self._hparams.frame_size)[:, :self._hparams.neff]) speech_2_features = np.maximum( speech_2_features, np.max(speech_2_features) / self._hparams.min_amp) speech_2_features = 20. * np.log10( speech_2_features * self._hparams.amp_fac) # same for the mixture speech_mix_spec0 = stft( speech_mix, self._hparams.frame_size)[:, :self._hparams.neff] speech_mix_features = np.abs(speech_mix_spec0) # speech_phase = speech_mix_spec0 / speech_mix_spec speech_mix_features = np.maximum( speech_mix_features, np.max(speech_mix_features) / self._hparams.min_amp) speech_mix_features = 20. * np.log10( speech_mix_features * self._hparams.amp_fac) max_mag = np.max(speech_mix_features) # if np.isnan(max_mag): # import ipdb; ipdb.set_trace() speech_VAD = (speech_mix_features > (max_mag - self._hparams.threshold)).astype(int) speech_mix_features = ( speech_mix_features - self._hparams.global_mean) / self._hparams.global_std #The ideal binary mask gives ownership of a time-frequency bin to the source whose magnitude is # maximum among all sources in that bin. # The mask values were assigned with 1 for active and 0 otherwise (binary), # making Y x Y^T as the ideal affinity matrix for the mixture. Y = np.array([ speech_1_features > speech_2_features, speech_1_features < speech_2_features ]).astype('bool') Y = np.transpose(Y, [1, 2, 0]).astype('bool') # speech_mix_features = speech_mix_features[0:self._hparams.dummy_slicing_dim, :] # speech_VAD = speech_VAD[0:self._hparams.dummy_slicing_dim, :] # Y = Y[0:self._hparams.dummy_slicing_dim, :, :] # print_info("{} vs {}".format(wav_file_1, wav_file_2)) end = time.time() print_info("Thread name: {} : took {}".format( threading.currentThread().getName(), end - start)) if speech_mix_features.shape[0] != 1247 or speech_VAD.shape[ 0] != 1247 or Y.shape[0] != 1247: raise Exception("Found files with improper duration/data") return speech_mix_features.astype('float32'), speech_VAD.astype( 'bool'), Y.astype('bool') except Exception as e: print_warn(e) print_error("{} vs {}".format(wav_file_1, wav_file_2)) return np.random.random((self._hparams.dummy_slicing_dim,129)).astype('float32'), \ np.empty((self._hparams.dummy_slicing_dim,129), dtype="bool"), \ np.empty((self._hparams.dummy_slicing_dim,129, 2), dtype="bool")
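# A toy numpy check (illustrative only) of the ideal-binary-mask construction used
# above: each time-frequency bin is "owned" by whichever source has the larger
# log-magnitude, yielding a [frames, neff, 2] boolean mask Y.
import numpy as np

speech_1_features = np.array([[3.0, 1.0], [0.5, 2.0]])   # toy [frames=2, neff=2]
speech_2_features = np.array([[1.0, 4.0], [0.7, 0.2]])

Y = np.array([speech_1_features > speech_2_features,
              speech_1_features < speech_2_features]).astype('bool')
Y = np.transpose(Y, [1, 2, 0])
print(Y.shape)    # (2, 2, 2)
print(Y[0, 0])    # [ True False] -> bin (frame 0, bin 0) assigned to speaker 1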
def discriminator(self, x, out_channel_dim, is_training=True, reuse=False): # It must be Auto-Encoder style architecture # Architecture : (64)4c2s-FC32_BR-FC64*14*14_BR-(1)4dc2s_S with tf.variable_scope("namespace_discriminator", reuse=reuse): # net = tf.nn.relu(conv2d(x, 64, 4, 4, 2, 2, name='d_conv1')) net = tf.layers.conv2d( x, 64, 4, strides=2, padding='same', kernel_initializer=tf.random_normal_initializer(stddev=0.02), name='d_conv1') net = tf.nn.relu(net) tf.logging.info("======> net: {}".format(net)) print_error("net1: {} ".format(net)) size = (self.image_size // 2) net = tf.reshape( net, [self._data_iterator.batch_size, size * size * 64]) # code = tf.nn.relu(bn(linear(net, 32, scope='d_fc6'), is_training=is_training, scope='d_bn6')) code = tf.contrib.layers.fully_connected(inputs=net, num_outputs=32, scope="d_fc6") code = tf.contrib.layers.batch_norm(code, decay=0.9, updates_collections=None, epsilon=1e-5, scale=True, is_training=is_training, scope='d_bn6') code = tf.nn.relu(code) print_error("code: {} ".format(code)) # net = tf.nn.relu(bn(linear(code, 64 * 14 * 14, scope='d_fc3'), is_training=is_training, scope='d_bn3')) size = (self.image_size // 2) net = tf.contrib.layers.fully_connected(inputs=code, num_outputs=64 * size * size, scope="d_fc3") net = tf.contrib.layers.batch_norm(net, decay=0.9, updates_collections=None, epsilon=1e-5, scale=True, is_training=is_training, scope='d_bn3') print_error("net: {} ".format(net)) print_error(net) size = (self.image_size // 2) net = tf.reshape(net, [self._data_iterator.batch_size, size, size, 64]) print_error(net) # out = tf.nn.sigmoid(deconv2d(net, [self.gan_config.batch_size, 28, 28, 1], 4, 4, 2, 2, name='d_dc5')) net = tf.layers.conv2d_transpose(net, out_channel_dim, 4, strides=2, padding='same', name='d_dc5') out = tf.nn.sigmoid(net) print_info("==================================") print_info(out) print_info(x) # recon loss recon_error = tf.sqrt( 2 * tf.nn.l2_loss(out - x)) / self._data_iterator.batch_size print_info("==================================") print_error(recon_error) return out, recon_error, code
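# A small numeric check (illustrative only) of the reconstruction error above:
# tf.nn.l2_loss(t) computes sum(t ** 2) / 2, so sqrt(2 * l2_loss(out - x)) is just
# the Euclidean norm of the reconstruction residual, divided here by the batch size.
import numpy as np

diff = np.array([3.0, 4.0])
l2_loss = np.sum(diff ** 2) / 2.0    # 12.5, matches tf.nn.l2_loss
print(np.sqrt(2 * l2_loss))          # 5.0 == np.linalg.norm(diff)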
def store_as_pickle(self, data, file_name):
    file_path = os.path.join(self.dataset_dir, file_name)
    print_info("Writing the pickle file {}...".format(file_path))
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)
    return None
def store_padded_data(self, file_name, data):
    file_path = os.path.join(self.EXPERIMENT_ROOT_DIR, file_name)
    print_info("Writing the padded data...")
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)
    return None
def visulaize(self, executor, file_path): """ :param executor: :param test_file_path: :return: """ estimator = executor.estimator in_data_features, voice_activity_detection_data_features, phase_features = self._get_predict_samples( file_path=file_path) in_data_features = np.asarray(in_data_features) voice_activity_detection_data_features = np.asarray( voice_activity_detection_data_features) N_frames = in_data_features.shape[0] hop_size = self._hparams.frame_size // 4 def get_dataset(): dataset = tf.data.Dataset.from_tensor_slices(({ self.FEATURE_1_NAME: in_data_features, self.FEATURE_2_NAME: voice_activity_detection_data_features }, np.ones_like(in_data_features))) dataset = dataset.batch(batch_size=1) print_info(dataset.output_shapes) return dataset predict_fn = estimator.predict(input_fn=lambda: get_dataset()) print_info("Shape of in data: {}".format(in_data_features.shape)) print_info("Number of frames for given file: {}".format(N_frames)) embeddings = [] i = 0 for predicted_value in predict_fn: # print("i = {}".format(i)) """ TODO: strange behaviour! 1 wav file = N samples Eg: N = 600 FramesPerSample=100, BatchSize = 1, NEFF = 129, EMD_K = 30 For each sample the embeddings is of shape [batch_size * frames_per_sample, NEFF, embd_dim]. For prediction batch size is made 1. Hence the embeddings colapse to [frames_per_sample, NEFF, embd_dim] 1 sample predictions will have `frames_per_sample` outputs Eg: If input audio file has 75 frames, the prediction will have [7500, NEFF, embd_dim] """ embeddings.append(predicted_value) i += 1 print_info("Number of embeddings predicted for given file: {}".format( len(embeddings))) print_error(np.asarray(embeddings).shape) N_assign = 0 step = 0 for frame_i in tqdm(range(N_frames)): # expand the dimesion to be inline with TF batch size in_data_np = np.expand_dims(in_data_features[frame_i], axis=0) in_phase_np = np.expand_dims(phase_features[frame_i], axis=0) voice_activity_detection_data_np = np.expand_dims( voice_activity_detection_data_features[frame_i], axis=0) embedding_np = np.asarray( embeddings[frame_i:frame_i + self._hparams.frames_per_sample]) # ---------------------------------------------- embedding_ac = [] for i, j in itertools.product( range(self._hparams.frames_per_sample), range(self._hparams.neff)): if voice_activity_detection_data_np[0, i, j] == 1: embedding_ac.append(embedding_np[i, j, :]) kmean = KMeans(n_clusters=2, random_state=0).fit(embedding_ac) # visualization using 3 PCA pca_Data = PCA(n_components=3).fit_transform(embedding_ac) fig = plt.figure(1, figsize=(8, 6)) ax = Axes3D(fig, elev=-150, azim=110) # ax.scatter(pca_Data[:, 0], pca_Data[:, 1], pca_Data[:, 2], # c=kmean.labels_, cmap=plt.cm.Paired) ax.scatter(pca_Data[:, 0], pca_Data[:, 1], pca_Data[:, 2], cmap=plt.cm.Paired) ax.set_title('Embedding visualization using the first 3 PCs') ax.set_xlabel('1st pc') ax.set_ylabel('2nd pc') ax.set_zlabel('3rd pc') if not os.path.exists("vis"): os.makedirs("vis") plt.savefig('vis/' + str(step) + 'pca.jpg') step += 1
def convert(self, path):
    print_info(path)
    if path.endswith("pdf"):
        return self.convert_pdf(pdf_path=path)
    else:
        return self.convert_image(image_path=path)