Example #1
    def __init__(self, configs):
        tag_id_file = configs.get('tag_id_file', None)
        model_pb = configs.get('model_pb', None)
        extract_text = configs.get('extract_text', False)
        if tag_id_file is None:
            raise ValueError('configs is missing required key: tag_id_file')
        else:
            self.label_name_dict = get_label_name_dict(tag_id_file, None)
        if model_pb is None:
            raise ValueError('configs is missing required key: model_pb')
        else:
            config = tf.ConfigProto(allow_soft_placement=True)
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            meta_graph_def = tf.saved_model.loader.load(
                self.sess, [tf.saved_model.tag_constants.SERVING], model_pb)
            signature_def = meta_graph_def.signature_def
            self.signature = signature_def[tf.saved_model.signature_constants.
                                           DEFAULT_SERVING_SIGNATURE_DEF_KEY]

        batch_size = configs.get('video_feats_extractor_batch_size', 8)
        imgfeat_extractor = configs.get('imgfeat_extractor', 'Youtube8M')
        self.feat_extractor = MultiModalFeatureExtract(
            batch_size=batch_size,
            imgfeat_extractor=imgfeat_extractor,
            extract_video=True,
            extract_audio=True,
            extract_text=extract_text)
Example #2
    def __init__(self, configs):
        tag_id_file = configs.get('tag_id_file', None)  # value for the key, or None if absent (unlike configs['tag_id_file'], which would raise KeyError)
        model_pb = configs.get('model_pb', None)
        if tag_id_file is None:
            raise ValueError('configs is missing required key: tag_id_file')
        else:
            self.label_name_dict = get_label_name_dict(tag_id_file, None)  # dict mapping class index to label name
        if model_pb is None:
            raise ValueError('configs is missing required key: model_pb')
        else:  # build a session and a serving signature
            config = tf.ConfigProto(allow_soft_placement=True)  # let TensorFlow fall back to a supported device automatically
            config.gpu_options.allow_growth = True  # allocate GPU memory on demand instead of reserving it all up front
            self.sess = tf.Session(config=config)  # tf.Session takes three args: target, graph, config
            meta_graph_def = tf.saved_model.loader.load(self.sess, [tf.saved_model.tag_constants.SERVING], model_pb)  # load the SavedModel into the session
            signature_def = meta_graph_def.signature_def  # dict mapping signature names to SignatureDef protos
            self.signature = signature_def[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]

        batch_size = configs.get('video_feats_extractor_batch_size', 8)  # defaults to 8 when the key is absent
        imgfeat_extractor = configs.get('imgfeat_extractor', 'Youtube8M')  # image feature extractor name, defaults to 'Youtube8M'
        self.feat_extractor = MultiModalFeatureExtract(batch_size=batch_size, imgfeat_extractor=imgfeat_extractor,
                                                       extract_video=True, extract_audio=True, extract_text=False)  # the multi-modal feature extractor
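For orientation, a minimal instantiation sketch. It assumes this __init__ belongs to the TaggingModel class shown in Example #4 below; every path is a hypothetical placeholder, not taken from the repository:

configs = {
    'tag_id_file': 'label_id.txt',        # hypothetical tag/id mapping file
    'model_pb': 'export/saved_model',     # hypothetical SavedModel directory
    'video_feats_extractor_batch_size': 8,
    'imgfeat_extractor': 'Youtube8M',
}
model = TaggingModel(configs)             # builds session, signature, and feature extractor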
Example #3
    parser.add_argument('--datafile_path', default='../dataset/datafile.txt')

    parser.add_argument('--extract_type', default=1,
                        type=int)  # 0: ALL, 1: VIDEO, 2: AUDIO, 3: TEXT

    parser.add_argument('--image_batch_size', default=25, type=int)
    parser.add_argument('--imgfeat_extractor', default='ViT', type=str)

    args = parser.parse_args()
    os.makedirs(args.frame_npy_folder, exist_ok=True)
    os.makedirs(args.audio_npy_folder, exist_ok=True)
    os.makedirs(args.text_txt_folder, exist_ok=True)
    gen = MultiModalFeatureExtract(batch_size=args.image_batch_size,
                                   imgfeat_extractor=args.imgfeat_extractor,
                                   extract_video=args.extract_type == 0
                                   or args.extract_type == 1,
                                   extract_audio=args.extract_type == 0
                                   or args.extract_type == 2,
                                   extract_text=args.extract_type == 0
                                   or args.extract_type == 3)

    def process_file(file_path, frame_npy_path, audio_npy_path, text_txt_path):
        if not os.path.exists(file_path):
            return
        print(file_path)
        gen.extract_feat(file_path, frame_npy_path, audio_npy_path,
                         text_txt_path)

    file_paths = glob.glob(args.test_files_dir + '/*.' + args.postfix)
    random.shuffle(file_paths)
    print('start extract feats')
    for file_path in tqdm.tqdm(file_paths, total=len(file_paths)):
        # Excerpt truncated here; presumably each iteration builds the video's
        # output paths and calls process_file (hypothetical reconstruction):
        vid = os.path.basename(file_path).split('.')[0]
        process_file(file_path,
                     os.path.join(args.frame_npy_folder, vid + '.npy'),
                     os.path.join(args.audio_npy_folder, vid + '.npy'),
                     os.path.join(args.text_txt_folder, vid + '.txt'))
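The three booleans passed to MultiModalFeatureExtract encode the extract_type convention (0 extracts every modality; 1, 2, 3 select video, audio, or text). An equivalent derivation, shown only as an illustrative alternative; MODALITY is a name introduced here, not in the original script:

    MODALITY = {0: ('video', 'audio', 'text'),
                1: ('video',),
                2: ('audio',),
                3: ('text',)}
    wanted = MODALITY[args.extract_type]
    gen = MultiModalFeatureExtract(batch_size=args.image_batch_size,
                                   imgfeat_extractor=args.imgfeat_extractor,
                                   extract_video='video' in wanted,
                                   extract_audio='audio' in wanted,
                                   extract_text='text' in wanted)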
Example #4
class TaggingModel():
    def __init__(self, configs):
        tag_id_file = configs.get('tag_id_file', None)
        model_pb = configs.get('model_pb', None)
        extract_text = configs.get('extract_text', False)
        if tag_id_file is None:
            raise ValueError('configs is missing required key: tag_id_file')
        else:
            self.label_name_dict = get_label_name_dict(tag_id_file, None)
        if model_pb is None:
            raise ValueError('configs is missing required key: model_pb')
        else:
            config = tf.ConfigProto(allow_soft_placement=True)
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            meta_graph_def = tf.saved_model.loader.load(
                self.sess, [tf.saved_model.tag_constants.SERVING], model_pb)
            signature_def = meta_graph_def.signature_def
            self.signature = signature_def[tf.saved_model.signature_constants.
                                           DEFAULT_SERVING_SIGNATURE_DEF_KEY]

        batch_size = configs.get('video_feats_extractor_batch_size', 8)
        imgfeat_extractor = configs.get('imgfeat_extractor', 'Youtube8M')
        self.feat_extractor = MultiModalFeatureExtract(
            batch_size=batch_size,
            imgfeat_extractor=imgfeat_extractor,
            extract_video=True,
            extract_audio=True,
            extract_text=extract_text)

    def image_preprocess(self, image, rescale=224):
        # resize to rescale (default 224), normalize to [0, 1], then apply f(x) = 2 * (x - 0.5)
        if image is None:
            print("WARNING: test input image is None")
            return np.zeros((rescale, rescale, 3))
        if image.shape[0] != rescale:
            image = cv2.resize(image, (rescale, rescale))
        image = 2 * (image / (np.max(image) + 1e-10) - 0.5)
        return image
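    # Quick numeric check of image_preprocess (illustrative, not part of the
    # original class): values are max-normalized to [0, 1], then affinely
    # mapped to [-1, 1]. For a uint8-range input:
    #   2 * (np.array([0.0, 127.5, 255.0]) / (255.0 + 1e-10) - 0.5)
    #   # -> array([-1., 0., 1.])  (up to the 1e-10 epsilon)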

    def text_preprocess(self, txt, max_len=128):
        tokens = ['[CLS]'] + tokokenizer.tokenize(txt)
        ids = tokokenizer.convert_tokens_to_ids(tokens)
        ids = ids[:max_len]
        ids = ids + [0] * (max_len - len(ids))
        return ids
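    # The truncate-then-pad step above guarantees len(ids) == max_len.
    # Worked example with hypothetical BERT token ids (101 is the [CLS] id):
    #   [101, 2769, 3221][:8] + [0] * (8 - 3)  ->  [101, 2769, 3221, 0, 0, 0, 0, 0]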

    def preprocess(self, feat_dict, max_frame_num=300):
        ret_dict = {}
        for feat_type in feat_dict:
            if feat_type == 'video':
                feats = np.zeros((max_frame_num, len(feat_dict['video'][0])))
                valid_num = min(max_frame_num, len(feat_dict['video']))
                feats[:valid_num] = feat_dict['video'][:valid_num]
            elif feat_type == 'audio':
                feats = np.zeros((max_frame_num, len(feat_dict['audio'][0])))
                valid_num = min(max_frame_num, len(feat_dict['audio']))
                feats[:valid_num] = feat_dict['audio'][:valid_num]
            elif feat_type == 'text':
                feats = self.text_preprocess(feat_dict['text'], 128)
            elif feat_type == 'image':
                feats = self.image_preprocess(feat_dict['image'])
            else:
                raise ValueError('unknown feature type: {}'.format(feat_type))
            ret_dict[feat_type] = feats
        return ret_dict
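    # Shape sketch for preprocess: e.g. a (120, 1024) Youtube8M video feature
    # comes back zero-padded to (300, 1024), with valid_num == 120.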

    def inference(self, test_file):
        with self.sess.as_default() as sess:
            start_time = time.time()
            feat_dict = self.feat_extractor.extract_feat(test_file, save=False)
            end_time = time.time()
            print("feature extract cost time: {} sec".format(end_time -
                                                             start_time))
            if 'text' in feat_dict:
                print(feat_dict['text'])
            else:
                feat_dict['text'] = ""

            feat_dict_preprocess = self.preprocess(feat_dict)
            feed_dict = {}

            # Get input tensor.
            for key in feat_dict:
                if key in self.signature.inputs:
                    feed_dict[self.signature.inputs[key].name] = [
                        feat_dict_preprocess[key]
                    ]

            if 'video_frames_num' in self.signature.inputs:
                feed_dict[self.signature.inputs['video_frames_num'].name] = [
                    len(feat_dict['video'])
                ]
            if 'audio_frames_num' in self.signature.inputs:
                feed_dict[self.signature.inputs['audio_frames_num'].name] = [
                    len(feat_dict['audio'])
                ]

            # Get output tensor.
            class_indexes = self.signature.outputs['class_indexes'].name
            predictions = self.signature.outputs['predictions'].name
            #video_embedding = self.signature.outputs['video_embedding'].name #(Optional)

            start_time = time.time()
            class_indexes, predictions = sess.run([class_indexes, predictions],
                                                  feed_dict)
            end_time = time.time()

            print("multi-modal tagging model forward cost time: {} sec".format(
                end_time - start_time))

            labels = [
                self.label_name_dict[index] for index in class_indexes[0]
            ]
            scores = predictions[0]

        return labels, scores
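To round out Example #4, a hypothetical end-to-end call; the config values and the video filename are placeholders:

configs = {'tag_id_file': 'label_id.txt',    # hypothetical paths
           'model_pb': 'export/saved_model'}
model = TaggingModel(configs)
labels, scores = model.inference('demo.mp4')
for label, score in zip(labels, scores):
    print('{}\t{:.4f}'.format(label, float(score)))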
Example #5
class TaggingModel():  # configs is a dict of settings
    def __init__(self, configs):
        tag_id_file = configs.get('tag_id_file', None)  # value for the key, or None if absent (unlike configs['tag_id_file'], which would raise KeyError)
        model_pb = configs.get('model_pb', None)
        if tag_id_file is None:
            raise ValueError('configs is missing required key: tag_id_file')
        else:
            self.label_name_dict = get_label_name_dict(tag_id_file, None)  # dict mapping class index to label name
        if model_pb is None:
            raise ValueError('configs is missing required key: model_pb')
        else:  # build a session and a serving signature
            config = tf.ConfigProto(allow_soft_placement=True)  # let TensorFlow fall back to a supported device automatically
            config.gpu_options.allow_growth = True  # allocate GPU memory on demand instead of reserving it all up front
            self.sess = tf.Session(config=config)  # tf.Session takes three args: target, graph, config
            meta_graph_def = tf.saved_model.loader.load(self.sess, [tf.saved_model.tag_constants.SERVING], model_pb)  # load the SavedModel into the session
            signature_def = meta_graph_def.signature_def  # dict mapping signature names to SignatureDef protos
            self.signature = signature_def[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]

        batch_size = configs.get('video_feats_extractor_batch_size', 8)  # defaults to 8 when the key is absent
        imgfeat_extractor = configs.get('imgfeat_extractor', 'Youtube8M')  # image feature extractor name, defaults to 'Youtube8M'
        self.feat_extractor = MultiModalFeatureExtract(batch_size=batch_size, imgfeat_extractor=imgfeat_extractor,
                                                       extract_video=True, extract_audio=True, extract_text=False)  # the multi-modal feature extractor

    def image_preprocess(self, image, rescale=224):
        # resize to rescale (default 224), normalize to [0, 1], then apply f(x) = 2 * (x - 0.5)
        if image is None:
            print("WARNING: test input image is None")
            return np.zeros((rescale, rescale, 3))
        if image.shape[0] != rescale:
            image = cv2.resize(image, (rescale, rescale))
        image = 2 * (image / (np.max(image) + 1e-10) - 0.5)
        return image

    def text_preprocess(self, txt, max_len=128):
        tokens = ['[CLS]'] + tokokenizer.tokenize(txt)
        ids = tokokenizer.convert_tokens_to_ids(tokens)
        ids = ids[:max_len]
        ids = ids + [0] * (max_len - len(ids))
        return ids


    def preprocess(self, feat_dict, max_frame_num=300):
        ret_dict = {}
        for feat_type in feat_dict:
            if feat_type == 'video':
                feats = np.zeros((max_frame_num, len(feat_dict['video'][0])))
                valid_num = min(max_frame_num, len(feat_dict['video']))
                feats[:valid_num] = feat_dict['video'][:valid_num]
            elif feat_type == 'audio':
                feats = np.zeros((max_frame_num, len(feat_dict['audio'][0])))
                valid_num = min(max_frame_num, len(feat_dict['audio']))
                feats[:valid_num] = feat_dict['audio'][:valid_num]
            elif feat_type == 'text':
                feats = self.text_preprocess(feat_dict['text'], 128)
            elif feat_type == 'image':
                feats = self.image_preprocess(feat_dict['image'])
            else:
                raise ValueError('unknown feature type: {}'.format(feat_type))
            ret_dict[feat_type] = feats
        return ret_dict

    def load_multimodal_feat(self, test_file, feat_dir):
        assert os.path.exists(feat_dir)
        feat_dict = {}
        # load the precomputed per-modality features saved by the offline extractor
        video_id = os.path.basename(test_file).split('.m')[0]
        feat_dict['video'] = np.load(os.path.join(feat_dir, 'video_npy', 'Youtube8M', 'tagging', video_id + '.npy'))
        feat_dict['audio'] = np.load(os.path.join(feat_dir, 'audio_npy', 'Vggish', 'tagging', video_id + '.npy'))
        feat_dict['image'] = cv2.imread(os.path.join(feat_dir, 'image_jpg', 'tagging', video_id + '.jpg'), 1)
        feat_dict['text'] = open(os.path.join(feat_dir, 'text_txt', 'tagging', video_id + '.txt')).read()
        return feat_dict

    def inference(self, test_file, load_feat=False, feat_dir=None):
        tf.reset_default_graph()
        with self.sess.as_default() as sess:
            if not load_feat:
                start_time = time.time()
                feat_dict = self.feat_extractor.extract_feat(test_file, save=False)
                end_time = time.time()
                print("feature extract cost time: {} sec".format(end_time - start_time))
            else:
                feat_dict = self.load_multimodal_feat(test_file, feat_dir)

            if 'text' not in feat_dict:
                feat_dict['text'] = ""
            feat_dict_preprocess = self.preprocess(feat_dict)
            feed_dict = {}
            
            # Get input tensor.
            for key in feat_dict:
                if key in self.signature.inputs:
                    feed_dict[self.signature.inputs[key].name] = [feat_dict_preprocess[key]]
                
            if 'video_frames_num' in self.signature.inputs:
                feed_dict[self.signature.inputs['video_frames_num'].name] = [len(feat_dict['video'])]
            if 'audio_frames_num' in self.signature.inputs:
                feed_dict[self.signature.inputs['audio_frames_num'].name] = [len(feat_dict['audio'])]
                
            # Get output tensor.
            class_indexes = self.signature.outputs['class_indexes'].name
            predictions = self.signature.outputs['predictions'].name
            #video_embedding = self.signature.outputs['video_embedding'].name #(Optional)
            
            start_time = time.time()
            class_indexes, predictions = sess.run([class_indexes, predictions], feed_dict)
            end_time = time.time()
            
            print("multi-modal tagging model forward cost time: {} sec".format(end_time -start_time))


            labels=[self.label_name_dict[index] for index in class_indexes[0]]
            scores = predictions[0]

        return labels, scores
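This variant can also reuse features extracted offline. A hypothetical call, assuming feat_dir contains the video_npy/Youtube8M/tagging, audio_npy/Vggish/tagging, image_jpg/tagging, and text_txt/tagging subfolders that load_multimodal_feat reads (all paths are placeholders):

model = TaggingModel(configs)    # configs as in the constructor above
labels, scores = model.inference('videos/demo.mp4', load_feat=True,
                                 feat_dir='features/')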