Example 1

import os

# init, build and process come from this project's own modules
def main():
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build('./output')

    # list all videos
    videos_path = './data/videos'
    subjects = './data/output'
    for vid in os.listdir(subjects):
        vid_path = os.path.join(subjects, vid)
        output_path = "%s/" % (vid_path)

        video_file = os.path.join(videos_path, vid)

        subjects_file = "%s/%s.subjects" % (vid_path, vid)
        keypoints_file = "%s/%s.keypoints" % (vid_path, vid)

        process(subjects_file, video_file, output_path, keypoints_file)
        # valid_points = extract_point(zip(seconds_list, subjects_list))

        # the summary loop below is still commented out, so this write
        # only truncates the keypoints file
        output = open(keypoints_file, mode='w', encoding='utf-8')
        # for second, keyword in output_points.items():
        # 	text = merge_text2(second, sentence_timeline)
        # 	summary = extract_summary(text, keyword, predict)
        # 	output.write("%s\t%s\t%s|%s\n" % (second, '', keyword, summary))
        output.close()
        print(vid, " cut finished.")
        print()
        print()
        print()
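
Example 1 leaves its extract_point call commented out; Examples 4 and 5 below rely on it to turn the per-second (second, label) stream into candidate keypoints. Its implementation is not shown in these snippets, so the following is only a hypothetical sketch that keeps the first second of each new non-'Unpredict' label run; the real selection logic may differ.

def extract_point(pairs):
    # hypothetical sketch: emit the first second of every run where the
    # label changes, dropping 'Unpredict'; the project's actual logic
    # is not shown in these examples
    points, prev = {}, None
    for second, label in pairs:
        label = label.strip()
        if label != prev and label != 'Unpredict':
            points[int(second)] = label
        prev = label
    return points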
Example 2

import os

# init, build, preprocess and config come from this project's own modules
def main():
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build(config.naive_bayes_model_path)
    # list all videos
    input_path = './data/output'
    output_path = './data/output'
    for vid in os.listdir(input_path):
        caption_file = os.path.join(input_path, vid, vid + '.captions')
        subject_file = os.path.join(output_path, vid, vid + '.subjects')
        preprocess(caption_file, subject_file, clfins)

        print(vid, " classify finished.")
        print()
        print()
        print()
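
Every example assumes a classifier instance returned by build(). Only its call sites are visible here, so this stub merely mirrors the interface the snippets use (predict, predict_proba with a probability threshold, and target_name); the bodies are placeholders, not the project's implementation.

class StubClassifier:
    """Placeholder mirroring the interface these examples call."""

    def predict(self, docs_list):
        # real version: most likely class index per document
        return [0 for _ in docs_list]

    def predict_proba(self, docs_list, threshold):
        # real version: class index per document, with some sentinel
        # that target_name() maps to 'Unpredict' whenever the top
        # probability falls below `threshold`
        return [0 for _ in docs_list]

    def target_name(self, predicted):
        # real version: map a class index to a label string
        return 'Unpredict'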
Example 3

import os

# config, init, build, preprocess and predict_internal come from this
# project's own modules
def process(caption_file, output_file, clfins=None):
    if os.path.exists(output_file):
        return output_file

    if clfins is None:
        init(config.jieba_stopwords_path, config.jieba_userdict_path)
        clfins = build(config.naive_bayes_model_path)

    sentence_list = preprocess(caption_file)
    seconds_list, docs_list = zip(*sentence_list)
    predicted_list = clfins.predict_proba(docs_list, 0.5)
    target_list = []

    with open(output_file, mode='w', encoding='utf-8') as output:
        for second, content, predicted in zip(seconds_list, docs_list,
                                              predicted_list):
            name = clfins.target_name(predicted)
            if name == 'Unpredict':
                # fall back to predict_internal when the classifier is unsure
                name = predict_internal(content)
            target_list.append(name)
            output.write("%s\t%s\t%s\n" % (second, content, name))
    return output_file
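
A minimal driver for the process() helper above, assuming the directory layout used in Example 2; the video id is borrowed from Example 4 purely for illustration.

if __name__ == '__main__':
    # hypothetical single-video run, mirroring Example 2's layout
    vid = '24CFC579E390B590'
    out = process('./data/output/%s/%s.captions' % (vid, vid),
                  './data/output/%s/%s.subjects' % (vid, vid))
    print('wrote', out)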
Example 4

import os

# init, build, preprocess, extract_point, tf_similarity and
# extract_summary come from this project's own modules
def main():
    # initialize the jieba dictionaries before building the classifier,
    # matching the other examples
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build('./output')
    # list all videos
    videos_folder_path = './data/captions'
    for folder in os.listdir(videos_folder_path):
        # debugging hook: change `pass` to `continue` to process only
        # the listed video id
        if folder != '24CFC579E390B590':
            # if folder != '11AFCABB49FADB3D':
            pass
            # continue

        one_video_folder_path = os.path.join(videos_folder_path, folder)
        caption_file = os.path.join(one_video_folder_path,
                                    "%s_baidu_ocr_result.txt" % folder)
        file_name_output = os.path.join(one_video_folder_path,
                                        "%s_category_result.txt" % folder)
        classify_result_file = os.path.join(one_video_folder_path,
                                            "classify_result.txt")
        keypoint_result_file = os.path.join(one_video_folder_path,
                                            "keypoint_result.txt")

        sentence_list = preprocess(caption_file)
        seconds_list, docs_list = zip(*sentence_list)
        predicted_list = clfins.predict_proba(docs_list, 0.5)
        target_list = []

        with open(classify_result_file, mode='w', encoding='utf-8') as output:
            for second, content, predicted in zip(seconds_list, docs_list,
                                                  predicted_list):
                name = clfins.target_name(predicted)
                target_list.append(name)
                output.write("%s\t%s\t%s\n" % (second, content, name))

        valid_points = extract_point(zip(seconds_list, target_list))
        print(valid_points)

        def merge_text2(k, timeline):  # gather ~30s of captions around second k
            noise = ("汽车之家", "之家", "汽车之", "看车买车用车", "家看车", "家买车用车")

            def clean(text):
                # strip watermark fragments, then keep the first token
                for word in noise:
                    text = text.replace(word, "")
                return text.strip().split(" ")[0]

            texts = []
            for x in range(k - 2, k + 30):
                if x not in timeline:
                    continue
                sss = clean(timeline[x])
                if x == k - 2:
                    texts.append(sss)
                    continue
                if x - 1 not in timeline:
                    continue
                sss0 = clean(timeline[x - 1])
                if tf_similarity(sss, sss0) > 0.8:
                    # skip captions nearly identical to the previous second
                    continue
                texts.append(sss)
            return ','.join(texts)

        # re-predict each candidate point using its merged 30s context
        sentence_timeline = dict(sentence_list)
        for k in valid_points:
            new_text = merge_text2(k, sentence_timeline)
            # recompute the classification on the merged text
            predicted_list = clfins.predict_proba([new_text], 0.5)
            # predicted_list = clfins.predict([new_text])
            valid_points[k] = clfins.target_name(predicted_list[0])
            # print(new_text, clfins.target_name(predicted_list[0]))
        # print(valid_points)

        def merge_points(points):
            # group seconds by label, then keep the earliest second per label
            reverse_points = {}
            for k, v in points.items():
                if v in reverse_points:
                    reverse_points[v].append(k)
                else:
                    reverse_points[v] = [k]
            if 'Unpredict' in reverse_points:
                reverse_points.pop('Unpredict')

            new_points = {}
            for label, seconds in reverse_points.items():
                # list.sort() sorts in place and returns None, so sort
                # first, then take the earliest second
                seconds.sort()
                new_points[int(seconds[0])] = label

            return new_points

        def predict(docs_list):
            predicted_list = clfins.predict(docs_list)
            return clfins.target_name(predicted_list[0])

        output_points = merge_points(valid_points)
        print(output_points)

        with open(keypoint_result_file, mode='w', encoding='utf-8') as output:
            for second, keyword in output_points.items():
                text = merge_text2(second, sentence_timeline)
                summary = extract_summary(text, keyword, predict)
                output.write("%s\t%s\t%s|%s\n" % (second, '', keyword, summary))
        print(folder, " finished.")
        print()
        print()
        print()
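
Because the original merge_points tripped over list.sort() returning None, here is a self-contained version of the same grouping logic plus a toy run; the labels and seconds are invented for illustration.

def merge_points(points):
    # group seconds by label, drop 'Unpredict', keep the earliest second
    reverse_points = {}
    for second, label in points.items():
        reverse_points.setdefault(label, []).append(second)
    reverse_points.pop('Unpredict', None)
    return {int(min(seconds)): label
            for label, seconds in reverse_points.items()}

print(merge_points({95: 'Power', 30: 'Power', 10: 'Unpredict', 60: 'Space'}))
# -> {30: 'Power', 60: 'Space'}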
Example 5

import os

# init, build, extract_point, tf_similarity and extract_summary come
# from this project's own modules
def main():
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build('./output')

    # list all videos
    subjects = './data/output'
    for vid in os.listdir(subjects):
        vid_path = os.path.join(subjects, vid)
        subjects_file = "%s/%s.subjects" % (vid_path, vid)
        keypoints_file = "%s/%s.keypoints" % (vid_path, vid)

        seconds_list = []
        sentences_list = []
        subjects_list = []
        with open(subjects_file, mode='r', encoding='utf-8') as f:
            for line in f:
                # each line: <second>\t<sentence>\t<label>
                arr = line.rstrip("\n").split("\t")
                seconds_list.append(int(arr[0]))
                sentences_list.append(arr[1])
                subjects_list.append(arr[2])
        print(subjects_list)

        valid_points = extract_point(zip(seconds_list, subjects_list))
        print(valid_points)

        def merge_text2(k, timeline):  # gather ~30s of captions around second k
            noise = ("汽车之家", "之家", "汽车之", "看车买车用车", "家看车", "家买车用车")

            def clean(text):
                # strip watermark fragments, then keep the first token
                for word in noise:
                    text = text.replace(word, "")
                return text.strip().split(" ")[0]

            texts = []
            for x in range(k - 2, k + 30):
                if x not in timeline:
                    continue
                sss = clean(timeline[x])
                if x == k - 2:
                    texts.append(sss)
                    continue
                if x - 1 not in timeline:
                    continue
                sss0 = clean(timeline[x - 1])
                if tf_similarity(sss, sss0) > 0.8:
                    # skip captions nearly identical to the previous second
                    continue
                texts.append(sss)
            return ','.join(texts)

        # re-predict each candidate point using its merged 30s context
        sentence_timeline = dict(zip(seconds_list, sentences_list))
        for k in valid_points:
            new_text = merge_text2(k, sentence_timeline)
            # recompute the classification on the merged text
            predicted_list = clfins.predict_proba([new_text], 0.5)
            # predicted_list = clfins.predict([new_text])
            valid_points[k] = clfins.target_name(predicted_list[0])
            # print(new_text, clfins.target_name(predicted_list[0]))
        # print(valid_points)

        def merge_points(points):
            # group seconds by label, then keep the earliest second per label
            reverse_points = {}
            for k, v in points.items():
                if v in reverse_points:
                    reverse_points[v].append(k)
                else:
                    reverse_points[v] = [k]
            if 'Unpredict' in reverse_points:
                reverse_points.pop('Unpredict')

            new_points = {}
            for label, seconds in reverse_points.items():
                # list.sort() sorts in place and returns None, so sort
                # first, then take the earliest second
                seconds.sort()
                new_points[int(seconds[0])] = label

            return new_points

        def predict(docs_list):
            predicted_list = clfins.predict(docs_list)
            return clfins.target_name(predicted_list[0])

        output_points = merge_points(valid_points)
        print(output_points)

        with open(keypoints_file, mode='w', encoding='utf-8') as output:
            for second, keyword in output_points.items():
                text = merge_text2(second, sentence_timeline)
                summary = extract_summary(text, keyword, predict)
                output.write("%s\t%s\t%s|%s\n" % (second, '', keyword, summary))
        print(vid, " cut finished.")
        print()
        print()
        print()
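
tf_similarity is never defined in these examples; merge_text2 only needs it to score how alike two consecutive captions are. One plausible stand-in is a term-frequency cosine over characters, sketched below; the real project may tokenize with jieba instead.

import math
from collections import Counter

def tf_similarity(a, b):
    # term-frequency cosine over characters; a guess at the interface
    # merge_text2 expects (returns ~1.0 for near-identical captions)
    ca, cb = Counter(a), Counter(b)
    dot = sum(ca[t] * cb[t] for t in ca)
    norm_a = math.sqrt(sum(n * n for n in ca.values()))
    norm_b = math.sqrt(sum(n * n for n in cb.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

print(tf_similarity("看车买车", "看车买车用车"))  # high overlap, close to 1.0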