def main():
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build('./output')

    # list all videos
    videos_path = './data/videos'
    subjects = './data/output'
    for vid in os.listdir(subjects):
        vid_path = os.path.join(subjects, vid)
        output_path = "%s/" % (vid_path)
        video_file = os.path.join(videos_path, vid)
        subjects_file = "%s/%s.subjects" % (vid_path, vid)
        keypoints_file = "%s/%s.keypoints" % (vid_path, vid)

        process(subjects_file, video_file, output_path, keypoints_file)

        # valid_points = extract_point(zip(seconds_list, subjects_list))

        output = open(keypoints_file, mode='w', encoding='utf-8')
        # for second, keyword in output_points.items():
        #     text = merge_text2(second, sentence_timeline)
        #     summary = extract_summary(text, keyword, predict)
        #     output.write("%s\t%s\t%s|%s\n" % (second, '', keyword, summary))
        output.close()

        print(vid, " cut finished.")
        print()
        print()
        print()
def main():
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build(config.naive_bayes_model_path)

    # list all videos
    input_path = './data/output'
    output_path = './data/output'
    for vid in os.listdir(input_path):
        caption_file = os.path.join(input_path, vid, vid + '.captions')
        subject_file = "%s/%s/%s.subjects" % (output_path, vid, vid)

        preprocess(caption_file, subject_file, clfins)

        print(vid, " classify finished.")
        print()
        print()
        print()
def process(caption_file, output_file, clfins=None):
    if os.path.exists(output_file):
        return output_file

    if clfins is None:
        init(config.jieba_stopwords_path, config.jieba_userdict_path)
        clfins = build(config.naive_bayes_model_path)

    sentence_list = preprocess(caption_file)
    seconds_list, docs_list = zip(*sentence_list)
    predicted_list = clfins.predict_proba(docs_list, 0.5)

    target_list = []
    output = open(output_file, mode='w', encoding='utf-8')
    for second, content, predicted in zip(seconds_list, docs_list, predicted_list):
        name = clfins.target_name(predicted)
        if name == 'Unpredict':
            name = predict_internal(content)
        target_list.append(name)
        output.write("%s\t%s\t%s\n" % (second, content, name))
    output.close()

    return output_file
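# A minimal usage sketch (assumed paths; '<vid>' is a placeholder for a video id):
# classify one caption file into a .subjects file, reusing a prebuilt classifier
# so the Naive Bayes model is loaded only once.
if __name__ == '__main__':
    init(config.jieba_stopwords_path, config.jieba_userdict_path)
    clfins = build(config.naive_bayes_model_path)
    process('./data/output/<vid>/<vid>.captions',
            './data/output/<vid>/<vid>.subjects',
            clfins)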
def main():
    clfins = build('./output')
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")

    # list all videos
    videos_folder_path = './data/captions'
    for folder in os.listdir(videos_folder_path):
        if folder != '24CFC579E390B590':
            # if folder != '11AFCABB49FADB3D':
            pass
            # continue

        one_video_folder_path = os.path.join(videos_folder_path, folder)
        caption_file = "./%s/%s_baidu_ocr_result.txt" % (one_video_folder_path, folder)
        file_name_output = "./%s/%s_category_result.txt" % (one_video_folder_path, folder)
        classify_result_file = "./%s/classify_result.txt" % (one_video_folder_path)
        keypoint_result_file = "./%s/keypoint_result.txt" % (one_video_folder_path)

        sentence_list = preprocess(caption_file)
        seconds_list, docs_list = zip(*sentence_list)
        predicted_list = clfins.predict_proba(docs_list, 0.5)

        target_list = []
        output = open(classify_result_file, mode='w', encoding='utf-8')
        for second, content, predicted in zip(seconds_list, docs_list, predicted_list):
            target_list.append(clfins.target_name(predicted))
            output.write("%s\t%s\t%s\n" % (second, content, clfins.target_name(predicted)))
        output.close()

        valid_points = extract_point(zip(seconds_list, target_list))
        print(valid_points)

        def merge_text2(k, dict):
            # Take point k as the anchor and collect roughly 30 seconds of captions around it.
            texts = []
            for x in range(k - 2, k + 30):
                if x not in dict.keys():
                    continue
                sss = dict[x].replace("汽车之家", "").replace("之家", "").replace(
                    "汽车之", "").replace("看车买车用车", "").replace("家看车", "").replace("家买车用车", "").strip()
                sss = sss.split(" ")[0]
                if x == k - 2:
                    texts.append(sss)
                    continue
                if x - 1 not in dict.keys():
                    continue
                sss0 = dict[x - 1].replace("汽车之家", "").replace(
                    "之家", "").replace("汽车之", "").replace("看车买车用车", "").replace(
                    "家看车", "").replace("家买车用车", "").strip()
                sss0 = sss0.split(" ")[0]
                # Skip captions that are near-duplicates of the previous second.
                if tf_similarity(sss, sss0) > 0.8:
                    # print(sss0, sss)
                    continue
                texts.append(sss)
            return ','.join(texts)

        # Re-run the prediction once over the merged text around each key point.
        sentence_timeline = dict(sentence_list)
        for k in valid_points:
            new_text = merge_text2(k, sentence_timeline)
            # Recompute the classification.
            predicted_list = clfins.predict_proba([new_text], 0.5)
            # predicted_list = clfins.predict([new_text])
            valid_points[k] = clfins.target_name(predicted_list[0])
            # print(new_text, clfins.target_name(predicted_list[0]))
        # print(valid_points)

        def merge_points(points):
            reverse_points = {}
            for k, v in points.items():
                if v in reverse_points.keys():
                    reverse_points[v].append(k)
                else:
                    reverse_points[v] = [k]
            if 'Unpredict' in reverse_points.keys():
                reverse_points.pop('Unpredict')
            new_points = {}
            for ks, v in reverse_points.items():
                # Keep only the earliest second for each subject.
                sortedv = sorted(v)
                new_points[int(sortedv[0])] = ks
            return new_points

        def predict(docs_list):
            predicted_list = clfins.predict(docs_list)
            return clfins.target_name(predicted_list[0])

        output_points = merge_points(valid_points)
        print(output_points)

        output = open(keypoint_result_file, mode='w', encoding='utf-8')
        for second, keyword in output_points.items():
            text = merge_text2(second, sentence_timeline)
            summary = extract_summary(text, keyword, predict)
            output.write("%s\t%s\t%s|%s\n" % (second, '', keyword, summary))
        output.close()

        print(folder, " finished.")
        print()
        print()
        print()
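# tf_similarity() is imported from elsewhere in the project and is not shown here.
# The sketch below is only an assumption of what it might look like: a cosine
# similarity over character term-frequency vectors, which makes the 0.8
# near-duplicate threshold used in merge_text2 concrete.
import math
from collections import Counter


def tf_similarity_sketch(a, b):
    """Cosine similarity between the term-frequency vectors of strings a and b."""
    ca, cb = Counter(a), Counter(b)
    dot = sum(ca[ch] * cb[ch] for ch in set(ca) & set(cb))
    norm = math.sqrt(sum(v * v for v in ca.values())) * \
        math.sqrt(sum(v * v for v in cb.values()))
    return dot / norm if norm else 0.0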
def main():
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build('./output')

    # list all videos
    subjects = './data/output'
    for vid in os.listdir(subjects):
        vid_path = os.path.join(subjects, vid)
        subjects_file = "%s/%s.subjects" % (vid_path, vid)
        keypoints_file = "%s/%s.keypoints" % (vid_path, vid)

        seconds_list = []
        sentences_list = []
        subjects_list = []
        subjects_input = open(subjects_file, mode='r', encoding='utf-8')
        for line in subjects_input.readlines():
            # Each line is "<second>\t<sentence>\t<subject>"; strip the trailing
            # newline so the subject label compares cleanly against 'Unpredict'.
            arr = line.rstrip("\n").split("\t")
            seconds_list.append(int(arr[0]))
            sentences_list.append(arr[1])
            subjects_list.append(arr[2])
        subjects_input.close()
        print(subjects_list)

        valid_points = extract_point(zip(seconds_list, subjects_list))
        print(valid_points)

        def merge_text2(k, dict):
            # Take point k as the anchor and collect roughly 30 seconds of captions around it.
            texts = []
            for x in range(k - 2, k + 30):
                if x not in dict.keys():
                    continue
                sss = dict[x].replace("汽车之家", "").replace("之家", "").replace("汽车之", "").replace(
                    "看车买车用车", "").replace("家看车", "").replace("家买车用车", "").strip()
                sss = sss.split(" ")[0]
                if x == k - 2:
                    texts.append(sss)
                    continue
                if x - 1 not in dict.keys():
                    continue
                sss0 = dict[x - 1].replace("汽车之家", "").replace("之家", "").replace(
                    "汽车之", "").replace("看车买车用车", "").replace("家看车", "").replace("家买车用车", "").strip()
                sss0 = sss0.split(" ")[0]
                # Skip captions that are near-duplicates of the previous second.
                if tf_similarity(sss, sss0) > 0.8:
                    # print(sss0, sss)
                    continue
                texts.append(sss)
            return ','.join(texts)

        # Re-run the prediction once over the merged text around each key point.
        sentence_timeline = dict(zip(seconds_list, sentences_list))
        for k in valid_points:
            new_text = merge_text2(k, sentence_timeline)
            # Recompute the classification.
            predicted_list = clfins.predict_proba([new_text], 0.5)
            # predicted_list = clfins.predict([new_text])
            valid_points[k] = clfins.target_name(predicted_list[0])
            # print(new_text, clfins.target_name(predicted_list[0]))
        # print(valid_points)

        def merge_points(points):
            reverse_points = {}
            for k, v in points.items():
                if v in reverse_points.keys():
                    reverse_points[v].append(k)
                else:
                    reverse_points[v] = [k]
            if 'Unpredict' in reverse_points.keys():
                reverse_points.pop('Unpredict')
            new_points = {}
            for ks, v in reverse_points.items():
                # Keep only the earliest second for each subject.
                sortedv = sorted(v)
                new_points[int(sortedv[0])] = ks
            return new_points

        def predict(docs_list):
            predicted_list = clfins.predict(docs_list)
            return clfins.target_name(predicted_list[0])

        output_points = merge_points(valid_points)
        print(output_points)

        output = open(keypoints_file, mode='w', encoding='utf-8')
        for second, keyword in output_points.items():
            text = merge_text2(second, sentence_timeline)
            summary = extract_summary(text, keyword, predict)
            output.write("%s\t%s\t%s|%s\n" % (second, '', keyword, summary))
        output.close()

        print(vid, " cut finished.")
        print()
        print()
        print()
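# Illustrative example of how merge_points() collapses the re-predicted key points
# (the subject labels here are made up; the real label set comes from the model):
# for each subject it keeps the earliest second and drops 'Unpredict' entries.
#
#   points = {12: 'SubjectA', 45: 'SubjectB', 80: 'SubjectA', 95: 'Unpredict'}
#   merge_points(points)  # -> {12: 'SubjectA', 45: 'SubjectB'}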