def __init__(self): super().__init__() self.sdm = SceneDetectionManager(detector_type='content', threshold=20, save_scenes=True) self.vectorizer = None self.subs_part_len = 10 self.subs_classifier = None self._fitted = False
def __init__(self): super().__init__() self.morph = MorphAnalyzer() self.sdm = SceneDetectionManager(detector_type='content', threshold=20, save_scenes=True) self.buzz_words = {} with open(os.path.join(os.path.dirname(__file__), 'artifacts/buzzwords.txt'), encoding='utf-8') as f: for line in f.readlines(): words = list(map(str.strip, line.split())) if not words: continue common_stem = self.morph.parse(words[0])[0].normal_form for word in words: self.buzz_words[self.morph.parse(word) [0].normal_form] = common_stem self.vectorizer = CountVectorizer() self.vectorizer.vocabulary = set(self.buzz_words.values()) self.subs_part_len = 10 self.subs_classifier = None self._fitted = False
class D2VLSTMBasedModel(BaseAdDetectorModel): def __init__(self): super().__init__() self.sdm = SceneDetectionManager(detector_type='content', threshold=20, save_scenes=True) self.vectorizer = None self.subs_part_len = 10 self.subs_classifier = None self._fitted = False self.window = 3 def save(self, path): if not self._fitted: raise Exception( "Model is not fitted yet. Fit model before saving.") if os.path.exists(path) and not os.path.isdir(path): raise Exception("Path for saving should be directory.") if not os.path.exists(path): os.mkdir(path) with open(os.path.join(path, 'lstm.json'), "w") as f: f.write(self.subs_classifier.to_json()) self.subs_classifier.save_weights(os.path.join(path, 'lstm.weights')) def load(self, path): with open(os.path.join(path, 'lstm.json'), "w") as f: self.subs_classifier = model_from_json(f.read()) self.subs_classifier.load_weights(os.path.join(path, 'lstm.weights')) self._fitted = True def find_ads(self, video_ids): if not isinstance(video_ids, list): video_ids = [video_ids] result = [] for video_id in video_ids: info = VideoInfo(video_id) subs = Subtitles(video_id, preprocess_russian_text_with_morph) scenes = self.sdm.detect_scenes(video_id) r = 0 xs = [] segments = [] for l in range(len(scenes)): while r + 1 < len(scenes) and scenes[r][1] - scenes[l][ 0] < self.subs_part_len: r += 1 words = list( map(str.strip, subs.fulltext(scenes[l][0], scenes[r][1]).split())) x = list(self.vectorizer.infer_vector(words)) x.append((scenes[l][0] + scenes[r][1]) / 2 / info.duration) xs.append(x) segments.append((scenes[l][0], scenes[r][1])) segments = [(0, 0)] * (self.window // 2) + segments + [ (0, 0) ] * (self.window // 2) features = len(xs[0]) xs = [[0] * features] * ( self.window // 2) + xs + [[0] * features] * (self.window // 2) X = [] for i in range(self.window // 2, len(xs) - self.window // 2): X.append(xs[i - self.window // 2:i + self.window // 2 + 1]) X = np.array(X) Y = self.subs_classifier.predict(X) > 0.5 ads = [] for i in range(self.window // 2, len(X) - self.window // 2): cnt = 0 for d in range(-(self.window // 2), self.window // 2 + 1): cnt += Y[i + d][self.window // 2 - d][0] if cnt > self.window // 2: ads.append(segments[i]) merged_ads = [] for ad in ads: if len(merged_ads) == 0 or ad[0] - merged_ads[-1][1] > 10: merged_ads.append(ad) else: merged_ads.append((merged_ads.pop()[0], ad[1])) result.append(merged_ads) return result if len(result) > 1 else result[0] def train(self, markups): video_ids = list(markups.keys()) print('start model training') tagged_data = [] subs_parts = [] Ys = [] ps = [] for idx, video_id in enumerate(video_ids): print('\rprocessing video {} ({}/{})'.format( video_id, idx + 1, len(video_ids)), end='') subs = Subtitles(video_id, preprocess_russian_text_with_morph) scenes = self.sdm.detect_scenes(video_id) r = 0 sp = [] ys = [] p = [] info = VideoInfo(video_id) for l in range(len(scenes)): while r + 1 < len(scenes) and scenes[r][1] - scenes[l][ 0] < self.subs_part_len: r += 1 seg = (scenes[l][0], scenes[r][1]) words = list( map(str.strip, subs.fulltext(scenes[l][0], scenes[r][1]).split())) sp.append(words) tagged_data.append( TaggedDocument(words=words, tags=[str(len(tagged_data))])) p.append((seg[0] + seg[1]) / 2 / info.duration) if self._inside_ad(seg, markups[video_id]): ys.append(1) elif self._intersect_ad(seg, markups[video_id]): ys.append(0.5) else: ys.append(0) subs_parts.append(sp) Ys.append(ys) ps.append(p) self.vectorizer = Doc2Vec(size=50, alpha=0.025, dm=0) self.vectorizer.build_vocab(tagged_data) self.vectorizer.train(tagged_data, total_examples=self.vectorizer.corpus_count, epochs=500) X = [] Y = [] for idx, video_id in enumerate(video_ids): print('\rprocessing video {} ({}/{})'.format( video_id, idx + 1, len(video_ids)), end='') xs = [ self.vectorizer.infer_vector(words) for words in subs_parts[idx] ] xs = list( np.hstack((xs, np.reshape(ps[idx], (len(ps[idx]), 1)))).tolist()) ys = Ys[idx] features = len(xs[0]) xs = [[0] * features] * ( self.window // 2) + xs + [[0] * features] * (self.window // 2) ys = [0] * (self.window // 2) + ys + [0] * (self.window // 2) for i in range(self.window // 2, len(xs) - self.window // 2): X.append(xs[i - self.window // 2:i + self.window // 2 + 1]) Y.append(ys[i - self.window // 2:i + self.window // 2 + 1]) X = np.array(X) Y = np.array(Y) Y = Y.reshape((Y.shape[0], Y.shape[1], 1)) print(X.shape, Y.shape) model = Sequential() model.add( Bidirectional(LSTM(100, return_sequences=True), input_shape=(self.window, X.shape[2]))) model.add(TimeDistributed(Dense(1, activation='sigmoid'))) model.compile(optimizer='adam', loss='binary_crossentropy') np.random.seed(0) model.fit(X, Y, batch_size=len(X), shuffle=True, epochs=500) self.subs_classifier = model self._fitted = True @staticmethod def _inside_ad(seg, ads): for ad in ads: if ad[0] <= seg[0] and seg[1] <= ad[1]: return True return False @staticmethod def _intersect_ad(seg, ads): for ad in ads: if seg[0] < ad[0] < seg[1] or seg[0] < ad[1] < seg[1]: return True return False
class D2VBasedModel(BaseAdDetectorModel): def __init__(self): super().__init__() self.sdm = SceneDetectionManager(detector_type='content', threshold=20, save_scenes=True) self.vectorizer = None self.subs_part_len = 10 self.subs_classifier = None self._fitted = False def save(self, path): if not self._fitted: raise Exception( "Model is not fitted yet. Fit model before saving.") if os.path.exists(path) and not os.path.isdir(path): raise Exception("Path for saving should be directory.") if not os.path.exists(path): os.mkdir(path) with open(os.path.join(path, 'mlp.model'), 'wb') as f: pickle.dump(self.subs_classifier, f) def load(self, path): with open(os.path.join(path, 'mlp.model'), 'rb') as f: self.subs_classifier = pickle.load(f) self._fitted = True def find_ads(self, video_ids): if not self._fitted: raise Exception("Train or load model before inference") if not isinstance(video_ids, list): video_ids = [video_ids] result = [] for video_id in video_ids: info = VideoInfo(video_id) subs = Subtitles(video_id, preprocess_russian_text_with_morph) scenes = self.sdm.detect_scenes(video_id) r = 0 ads = [] for l in range(len(scenes)): while r + 1 < len(scenes) and scenes[r][1] - scenes[l][ 0] < self.subs_part_len: r += 1 words = list( map(str.strip, subs.fulltext(scenes[l][0], scenes[r][1]).split())) x = self.vectorizer.infer_vector(words) x.append(scenes[l][0] / info.duration) y = self.subs_classifier.predict([x])[0] if y == 1: ads.append((scenes[l][0], scenes[r][1])) merged_ads = [] for ad in ads: if len(merged_ads) == 0 or ad[0] - merged_ads[-1][1] > 10: merged_ads.append(ad) else: merged_ads.append((merged_ads.pop()[0], ad[1])) result.append(merged_ads) return result if len(result) > 1 else result[0] def train(self, markups): video_ids = list(markups.keys()) X = [] Y = [] print('start model training') tagged_data = [] subs_parts = [] for idx, video_id in enumerate(video_ids): print('\rprocessing video {} ({}/{})'.format( video_id, idx + 1, len(video_ids)), end='') subs = Subtitles(video_id, preprocess_russian_text_with_morph) scenes = self.sdm.detect_scenes(video_id) r = 0 for l in range(len(scenes)): while r + 1 < len(scenes) and scenes[r][1] - scenes[l][ 0] < self.subs_part_len: r += 1 seg = (scenes[l][0], scenes[r][1]) if self._intersect_ad(seg, markups[video_id]): continue words = list( map(str.strip, subs.fulltext(scenes[l][0], scenes[r][1]).split())) subs_parts.append(words) tagged_data.append( TaggedDocument(words=words, tags=[str(len(tagged_data))])) if self._inside_ad(seg, markups[video_id]): Y.append(1) else: Y.append(0) self.vectorizer = Doc2Vec(size=50, alpha=0.025, dm=0) self.vectorizer.build_vocab(tagged_data) self.vectorizer.train(tagged_data, total_examples=self.vectorizer.corpus_count, epochs=500) X = np.array( [self.vectorizer.infer_vector(words) for words in subs_parts]) Y = np.array(Y) print('X shape: ', X.shape) print('Y shape: ', Y.shape) self.subs_classifier = MLPClassifier(hidden_layer_sizes=(50, 10), solver='adam', random_state=0, learning_rate='adaptive', max_iter=1000) self.subs_classifier.fit(X, Y) self._fitted = True @staticmethod def _inside_ad(seg, ads): for ad in ads: if ad[0] <= seg[0] and seg[1] <= ad[1]: return True return False @staticmethod def _intersect_ad(seg, ads): for ad in ads: if seg[0] < ad[0] < seg[1] or seg[0] < ad[1] < seg[1]: return True return False
class BuzzwordsLSTMBasedModel(BaseAdDetectorModel): def __init__(self): super().__init__() self.morph = MorphAnalyzer() self.sdm = SceneDetectionManager(detector_type='content', threshold=20, save_scenes=True) self.buzz_words = {} with open(os.path.join(os.path.dirname(__file__), 'artifacts/buzzwords.txt'), encoding='utf-8') as f: for line in f.readlines(): words = list(map(str.strip, line.split())) if not words: continue common_stem = self.morph.parse(words[0])[0].normal_form for word in words: self.buzz_words[self.morph.parse(word) [0].normal_form] = common_stem self.vectorizer = CountVectorizer() self.vectorizer.vocabulary = set(self.buzz_words.values()) self.subs_part_len = 10 self.subs_classifier = None self._fitted = False self.window = 3 def save(self, path): if not self._fitted: raise Exception( "Model is not fitted yet. Fit model before saving.") if os.path.exists(path) and not os.path.isdir(path): raise Exception("Path for saving should be directory.") if not os.path.exists(path): os.mkdir(path) with open(os.path.join(path, 'lstm.json'), "w") as f: f.write(self.subs_classifier.to_json()) self.subs_classifier.save_weights(os.path.join(path, 'lstm.weights')) def load(self, path): with open(os.path.join(path, 'lstm.json'), "w") as f: self.subs_classifier = model_from_json(f.read()) self.subs_classifier.load_weights(os.path.join(path, 'lstm.weights')) self._fitted = True def find_ads(self, video_ids): if not self._fitted: raise Exception("Train or load model before inference") if not isinstance(video_ids, list): video_ids = [video_ids] result = [] for video_id in video_ids: info = VideoInfo(video_id) subs = Subtitles(video_id, preprocess_russian_text_with_morph) scenes = self.sdm.detect_scenes(video_id) r = 0 segments = [] xs = [] for l in range(len(scenes)): while r + 1 < len(scenes) and scenes[r][1] - scenes[l][ 0] < self.subs_part_len: r += 1 words = map(str.strip, subs.fulltext(scenes[l][0], scenes[r][1]).split()) words = filter(lambda word: word in self.buzz_words, words) words = map(lambda word: self.buzz_words[word], words) subs_part = ' '.join(words) x = list(self.vectorizer.transform([subs_part]).toarray()[0]) x.append(scenes[l][0] / info.duration) xs.append(x) seg = (scenes[l][0], scenes[r][1]) segments.append(seg) segments = [(0, 0)] * (self.window // 2) + segments + [ (0, 0) ] * (self.window // 2) features = len(xs[0]) xs = [[0] * features] * ( self.window // 2) + xs + [[0] * features] * (self.window // 2) X = [] for i in range(self.window // 2, len(xs) - self.window // 2): X.append(xs[i - self.window // 2:i + self.window // 2 + 1]) X = np.array(X) Y = self.subs_classifier.predict(X) > 0.5 ads = [] for i in range(self.window // 2, len(X) - self.window // 2): cnt = 0 for d in range(-(self.window // 2), self.window // 2 + 1): cnt += Y[i + d][self.window // 2 - d][0] if cnt > self.window // 2: ads.append(segments[i]) merged_ads = [] for ad in ads: if len(merged_ads) == 0 or ad[0] - merged_ads[-1][1] > 10: merged_ads.append(ad) else: merged_ads.append((merged_ads.pop()[0], ad[1])) result.append(merged_ads) return result if len(result) > 1 else result[0] def train(self, markups): video_ids = list(markups.keys()) X = [] Y = [] print('start model training') for idx, video_id in enumerate(video_ids): print('\rprocessing video {} ({}/{})'.format( video_id, idx + 1, len(video_ids)), end='') subs = Subtitles(video_id, preprocess_russian_text_with_morph) scenes = self.sdm.detect_scenes(video_id) r = 0 xs = [] ys = [] for l in range(len(scenes)): while r + 1 < len(scenes) and scenes[r][1] - scenes[l][ 0] < self.subs_part_len: r += 1 words = map(str.strip, subs.fulltext(scenes[l][0], scenes[r][1]).split()) words = filter(lambda word: word in self.buzz_words, words) words = map(lambda word: self.buzz_words[word], words) subs_part = ' '.join(words) x = list(self.vectorizer.transform([subs_part]).toarray()[0]) info = VideoInfo(video_id) x.append(scenes[l][0] / info.duration) seg = (scenes[l][0], scenes[r][1]) xs.append(x) if self._intersect_ad(seg, markups[video_id]): ys.append(0.5) elif self._inside_ad(seg, markups[video_id]): ys.append(1) else: ys.append(0) features = len(xs[0]) xs = [[0] * features] * ( self.window // 2) + xs + [[0] * features] * (self.window // 2) ys = [0] * (self.window // 2) + ys + [0] * (self.window // 2) for i in range(self.window // 2, len(xs) - self.window // 2): X.append(xs[i - self.window // 2:i + self.window // 2 + 1]) Y.append(ys[i - self.window // 2:i + self.window // 2 + 1]) X = np.array(X) Y = np.array(Y) Y = Y.reshape((Y.shape[0], Y.shape[1], 1)) print(X.shape, Y.shape) model = Sequential() model.add( Bidirectional(LSTM(100, return_sequences=True), input_shape=(self.window, X.shape[2]))) model.add(TimeDistributed(Dense(1, activation='sigmoid'))) model.compile(optimizer='adam', loss='binary_crossentropy') np.random.seed(0) model.fit(X, Y, batch_size=len(X), shuffle=True, epochs=300) self.subs_classifier = model self._fitted = True @staticmethod def _inside_ad(seg, ads): for ad in ads: if ad[0] <= seg[0] and seg[1] <= ad[1]: return True return False @staticmethod def _intersect_ad(seg, ads): for ad in ads: if seg[0] < ad[0] < seg[1] or seg[0] < ad[1] < seg[1]: return True return False
class BuzzwordsMLPBasedModel(BaseAdDetectorModel): def __init__(self): super().__init__() self.morph = MorphAnalyzer() self.sdm = SceneDetectionManager(detector_type='content', threshold=20, save_scenes=True) self.buzz_words = {} with open(os.path.join(os.path.dirname(__file__), 'artifacts/buzzwords.txt'), encoding='utf-8') as f: for line in f.readlines(): words = list(map(str.strip, line.split())) if not words: continue common_stem = self.morph.parse(words[0])[0].normal_form for word in words: self.buzz_words[self.morph.parse(word) [0].normal_form] = common_stem self.vectorizer = CountVectorizer() self.vectorizer.vocabulary = set(self.buzz_words.values()) self.subs_part_len = 10 self.subs_classifier = None self._fitted = False def save(self, path): if not self._fitted: raise Exception( "Model is not fitted yet. Fit model before saving.") if os.path.exists(path) and not os.path.isdir(path): raise Exception("Path for saving should be directory.") if not os.path.exists(path): os.mkdir(path) with open(os.path.join(path, 'mlp.model'), 'wb') as f: pickle.dump(self.subs_classifier, f) def load(self, path): with open(os.path.join(path, 'mlp.model'), 'rb') as f: self.subs_classifier = pickle.load(f) self._fitted = True def find_ads(self, video_ids): if not self._fitted: raise Exception("Train or load model before inference") if not isinstance(video_ids, list): video_ids = [video_ids] result = [] for video_id in video_ids: info = VideoInfo(video_id) subs = Subtitles(video_id, preprocess_russian_text_with_morph) scenes = self.sdm.detect_scenes(video_id) r = 0 ads = [] for l in range(len(scenes)): while r + 1 < len(scenes) and scenes[r][1] - scenes[l][ 0] < self.subs_part_len: r += 1 words = map(str.strip, subs.fulltext(scenes[l][0], scenes[r][1]).split()) words = filter(lambda word: word in self.buzz_words, words) words = map(lambda word: self.buzz_words[word], words) subs_part = ' '.join(words) x = list(self.vectorizer.transform([subs_part]).toarray()[0]) x.append(scenes[l][0] / info.duration) y = self.subs_classifier.predict([x])[0] if y == 1: ads.append((scenes[l][0], scenes[r][1])) merged_ads = [] for ad in ads: if len(merged_ads) == 0 or ad[0] - merged_ads[-1][1] > 10: merged_ads.append(ad) else: merged_ads.append((merged_ads.pop()[0], ad[1])) result.append(merged_ads) return result if len(result) > 1 else result[0] def train(self, markups): video_ids = list(markups.keys()) X = [] Y = [] print('start model training') for idx, video_id in enumerate(video_ids): print('\rprocessing video {} ({}/{})'.format( video_id, idx + 1, len(video_ids)), end='') subs = Subtitles(video_id, preprocess_russian_text_with_morph) scenes = self.sdm.detect_scenes(video_id) r = 0 for l in range(len(scenes)): while r + 1 < len(scenes) and scenes[r][1] - scenes[l][ 0] < self.subs_part_len: r += 1 words = map(str.strip, subs.fulltext(scenes[l][0], scenes[r][1]).split()) words = filter(lambda word: word in self.buzz_words, words) words = map(lambda word: self.buzz_words[word], words) subs_part = ' '.join(words) x = list(self.vectorizer.transform([subs_part]).toarray()[0]) info = VideoInfo(video_id) x.append(scenes[l][0] / info.duration) seg = (scenes[l][0], scenes[r][1]) if self._intersect_ad(seg, markups[video_id]): continue X.append(x) if self._inside_ad(seg, markups[video_id]): Y.append(1) else: Y.append(0) X = np.array(X) Y = np.array(Y) self.subs_classifier = MLPClassifier(hidden_layer_sizes=(15, 5), solver='adam', random_state=0, learning_rate='adaptive', max_iter=1000) self.subs_classifier.fit(X, Y) self._fitted = True @staticmethod def _inside_ad(seg, ads): for ad in ads: if ad[0] <= seg[0] and seg[1] <= ad[1]: return True return False @staticmethod def _intersect_ad(seg, ads): for ad in ads: if seg[0] < ad[0] < seg[1] or seg[0] < ad[1] < seg[1]: return True return False