Example #1
def main():
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build('./output')

    # list all videos
    videos_path = './data/videos'
    subjects = './data/output'
    for vid in os.listdir(subjects):
        vid_path = os.path.join(subjects, vid)
        output_path = "%s/" % (vid_path)

        video_file = os.path.join(videos_path, vid)

        subjects_file = "%s/%s.subjects" % (vid_path, vid)
        keypoints_file = "%s/%s.keypoints" % (vid_path, vid)

        process(subjects_file, video_file, output_path, keypoints_file)
        # valid_points = extract_point(zip(seconds_list, subjects_list))

        # the summary-writing loop below is commented out, so this only creates
        # an empty keypoints file
        output = open(keypoints_file, mode='w', encoding='utf-8')
        # for second, keyword in output_points.items():
        # 	text = merge_text2(second, sentence_timeline)
        # 	summary = extract_summary(text, keyword, predict)
        # 	output.write("%s\t%s\t%s|%s\n" % (second, '', keyword, summary))
        output.close()
        print(vid, " cut finished.")
        print()
        print()
        print()
Example #2
def tokenize(jack_code, xml_format=False):
	tokenizer.init(jack_code)
	tokenizer.run()
	if xml_format:
		return tokenizer.get_tokenized_list()
	else:
		return tokenizer.get_jack_code()
def main(_):  
  os.system('mkdir -p %s' % FLAGS.dir)
  tokenizer.init(FLAGS.tokenizer_vocab)
  global examples, vocab, unk_vocab, char_vocab, pos_vocab, tag_vocab, ner_vocab, ngram_vocab
  examples = pd.read_csv(FLAGS.input)
  #if 'train' in FLAGS.input:
  #  examples = shuffle(examples, random_state=1024)
  vocab = Vocabulary(FLAGS.vocab)
  # unk_vocab is actually a small vocab, so it will generate unk tokens for training
  #unk_vocab =  Vocabulary(FLAGS.vocab.replace('vocab.txt', 'unk_vocab.txt'))
  char_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'char_vocab.txt'))
  pos_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'pos_vocab.txt'))
  tag_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'tag_vocab.txt'))
  ner_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ner_vocab.txt'))
  ngram_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ngram_vocab.txt'))

  global enprob_dict
  enprob_dict = {}
  enprob_file = '~/data/kaggle/toxic/train.enprob.csv' if 'train' in FLAGS.input else '~/data/kaggle/toxic/test.enprob.csv'
  enprob_df = pd.read_csv(enprob_file)
  for id, enprob in zip(enprob_df['id'].values, enprob_df['enprob'].values):
    enprob_dict[id] = enprob
  enprob_dict['0'] = 1.

  pool = multiprocessing.Pool()
  pool.map(build_features, range(FLAGS.num_records))
  pool.close()
  pool.join()

  #build_features(0)

  print('num_records:', counter.value)
  mode = get_mode()
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/num_records.txt'.format(mode)
  gezi.write_to_txt(counter.value, out_file)
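The fan-out above hands one shard index to each worker through multiprocessing.Pool, which requires the worker to be a module-level function. A minimal, self-contained sketch of the same pattern, with a hypothetical write_shard worker and NUM_SHARDS constant standing in for build_features and FLAGS.num_records:

import multiprocessing


NUM_SHARDS = 4  # stand-in for FLAGS.num_records


def write_shard(index):
    # each worker handles exactly one shard index, like build_features(index) above
    with open('shard-%d.txt' % index, 'w', encoding='utf-8') as f:
        f.write('records for shard %d\n' % index)


if __name__ == '__main__':
    pool = multiprocessing.Pool()
    pool.map(write_shard, range(NUM_SHARDS))
    pool.close()
    pool.join()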
Example #4
def main(_):
    tokenizer.init(FLAGS.tokenizer_vocab)
    if FLAGS.full_tokenizer:
        gezi.segment.init_spacy_full()

    os.system('mkdir -p %s' % FLAGS.out_dir)

    print('name', FLAGS.name, 'out_dir', FLAGS.out_dir)

    global counter
    counter = WordCounter(write_unknown=FLAGS.write_unknown,
                          most_common=FLAGS.most_common,
                          min_count=FLAGS.min_count)

    global char_counter
    char_counter = WordCounter(write_unknown=FLAGS.write_unknown,
                               most_common=FLAGS.most_common,
                               min_count=FLAGS.min_count)

    global ngram_counter
    ngram_counter = WordCounter(write_unknown=True, min_count=FLAGS.min_count)

    global pos_counter, tag_counter, ner_counter
    pos_counter = WordCounter(write_unknown=True, min_count=1)
    tag_counter = WordCounter(write_unknown=True, min_count=1)
    ner_counter = WordCounter(write_unknown=True, min_count=1)

    run(FLAGS.input)

    if FLAGS.test_input and not FLAGS.name:
        run(FLAGS.test_input, count=FLAGS.test_count)

    if not FLAGS.name:
        vocab_name = FLAGS.vocab_name or 'vocab'
        os.system('mkdir -p %s' % FLAGS.out_dir)
        out_txt = os.path.join(FLAGS.out_dir, '%s.txt' % vocab_name)
        counter.save(out_txt)

        out_txt = os.path.join(FLAGS.out_dir, 'char_%s.txt' % vocab_name)
        char_counter.save(out_txt)

        out_txt = os.path.join(FLAGS.out_dir, 'pos_vocab.txt')
        pos_counter.save(out_txt)

        out_txt = os.path.join(FLAGS.out_dir, 'tag_vocab.txt')
        tag_counter.save(out_txt)

        out_txt = os.path.join(FLAGS.out_dir, 'ner_vocab.txt')
        ner_counter.save(out_txt)

        out_txt = os.path.join(FLAGS.out_dir, 'ngram_vocab.txt')
        if not FLAGS.max_ngrams:
            ngram_counter.save(out_txt)
        else:
            # if only the top-N ngrams are needed later, e.g.: head -n 200000 ngram_vocab.full.txt > ngram_vocab.txt
            out_full_txt = os.path.join(FLAGS.out_dir, 'ngram_vocab.full.txt')
            ngram_counter.save(out_full_txt)
            os.system('head -n %d %s > %s' %
                      (FLAGS.max_ngrams, out_full_txt, out_txt))
def main(_):
    tokenizer.init(FLAGS.tokenizer_vocab)
    os.system('mkdir -p %s' % FLAGS.out_dir)

    print('name', FLAGS.name, 'out_dir', FLAGS.out_dir)

    run(FLAGS.input)

    global name
    name = 'test'
    if FLAGS.test_input and not FLAGS.name:
        run(FLAGS.test_input)
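WordCounter and its save method come from the surrounding project (gezi), so they are not shown here. A rough stand-in built on collections.Counter, illustrating the count / min_count / save flow the example above relies on; save_vocab and the sample text are made up, and the write_unknown handling is omitted:

from collections import Counter


def save_vocab(counter, path, min_count=1, most_common=None):
    # keep tokens seen at least min_count times, optionally truncate to the
    # most_common entries, and write one "token<TAB>count" line per token
    items = [(w, c) for w, c in counter.most_common(most_common) if c >= min_count]
    with open(path, 'w', encoding='utf-8') as f:
        for word, count in items:
            f.write('%s\t%d\n' % (word, count))


words = Counter('a quick brown fox jumps over a lazy dog a'.split())
save_vocab(words, 'vocab.txt', min_count=1)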
Example #6
def main():
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build(config.naive_bayes_model_path)
    # list all videos
    input_path = './data/output'
    output_path = './data/output'
    for vid in os.listdir(input_path):
        caption_file = os.path.join(input_path, vid, vid + '.captions')
        subject_file = "%s/%s/%s.subjects" % (output_path, vid, vid)
        preprocess(caption_file, subject_file, clfins)

        print(vid, " classify finished.")
        print()
        print()
        print()
def main(_):
    tokenizer.init(FLAGS.tokenizer_vocab)
    global examples, vocab, char_vocab
    examples = pd.read_csv(FLAGS.input)
    #if 'train' in FLAGS.input:
    #  examples = shuffle(examples, random_state=1024)
    vocab = Vocabulary(FLAGS.vocab)
    char_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'char_vocab.txt'))

    pool = multiprocessing.Pool()
    pool.map(build_features, range(FLAGS.num_records))
    pool.close()
    pool.join()

    # build_features(0)

    print('num_records:', counter.value)
    mode = 'train' if 'train' in FLAGS.input else 'test'
    out_file = os.path.dirname(
        FLAGS.vocab) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)
def main(_):
    tokenizer.init(FLAGS.tokenizer_vocab)
    global counter
    counter = WordCounter(write_unknown=FLAGS.write_unknown,
                          most_common=FLAGS.most_common,
                          min_count=FLAGS.min_count)

    global char_counter
    char_counter = WordCounter(write_unknown=FLAGS.write_unknown,
                               most_common=FLAGS.most_common,
                               min_count=FLAGS.min_count)

    run(FLAGS.input)
    if FLAGS.test_input:
        run(FLAGS.test_input, count=FLAGS.test_count)

    vocab_name = FLAGS.vocab_name or 'vocab'
    os.system('mkdir -p %s' % FLAGS.out_dir)
    out_txt = os.path.join(FLAGS.out_dir, '%s.txt' % vocab_name)
    counter.save(out_txt)

    out_txt = os.path.join(FLAGS.out_dir, 'char_%s.txt' % vocab_name)
    char_counter.save(out_txt)
Example #9
def process(caption_file, output_file, clfins=None):
    if os.path.exists(output_file):
        return output_file

    if clfins is None:
        init(config.jieba_stopwords_path, config.jieba_userdict_path)
        clfins = build(config.naive_bayes_model_path)

    sentence_list = preprocess(caption_file)
    seconds_list, docs_list = zip(*sentence_list)
    predicted_list = clfins.predict_proba(docs_list, 0.5)
    target_list = []

    output = open(output_file, mode='w', encoding='utf-8')
    for second, content, predicted in zip(seconds_list, docs_list,
                                          predicted_list):
        name = clfins.target_name(predicted)
        if name == 'Unpredict':
            name = predict_internal(content)
        target_list.append(name)
        output.write("%s\t%s\t%s\n" % (second, content, name))
    output.close()
    return output_file
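A hypothetical invocation of process, mirroring the directory layout used in Examples #1 and #6; the VID placeholder and paths are purely illustrative:

if __name__ == '__main__':
    # with clfins omitted, process() loads the jieba resources and the model itself
    subjects_path = process('./data/output/VID/VID.captions',
                            './data/output/VID/VID.subjects')
    print('wrote', subjects_path)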
Example #10
    test_predicted = clfins.predict_proba(test_docs, 0.6)
    total = 0
    right = 0
    for filename, doc, category, predict in zip(test_data.filenames, test_docs,
                                                test_data.target,
                                                test_predicted):
        if predict is None:
            continue
        total = total + 1
        if test_data.target_names[category] == clfins.target_name(predict):
            right = right + 1
            continue
        print('%r => %s| right:%s, predict:%s, %s' %
              (filename, doc, test_data.target_names[category],
               clfins.target_name(predict), predict))

    print(total, ",", right, ",", right / total)


def build(model_path):
    clfins = Classifier(model_path)
    return clfins


if __name__ == '__main__':
    init("../jieba/stopwords_cn.txt", "../jieba/userdict.txt")
    clfins = build("../models")
    test(clfins)
    test_performence(clfins)
Example #11
def main():
    clfins = build('./output')
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    # list all videos
    videos_folder_path = './data/captions'
    for folder in os.listdir(videos_folder_path):
        # debugging filter: uncomment the 'continue' below to restrict the run
        # to a single video folder
        if folder != '24CFC579E390B590':
            # if folder != '11AFCABB49FADB3D':
            pass
            # continue

        one_video_folder_path = os.path.join(videos_folder_path, folder)
        caption_file = "./%s/%s_baidu_ocr_result.txt" % (one_video_folder_path,
                                                         folder)
        file_name_output = "./%s/%s_category_result.txt" % (
            one_video_folder_path, folder)
        classify_result_file = "./%s/classify_result.txt" % (
            one_video_folder_path)
        keypoint_result_file = "./%s/keypoint_result.txt" % (
            one_video_folder_path)

        sentence_list = preprocess(caption_file)
        seconds_list, docs_list = zip(*sentence_list)
        predicted_list = clfins.predict_proba(docs_list, 0.5)
        target_list = []

        output = open(classify_result_file, mode='w', encoding='utf-8')
        for second, content, predicted in zip(seconds_list, docs_list,
                                              predicted_list):
            target_list.append(clfins.target_name(predicted))
            output.write("%s\t%s\t%s\n" %
                         (second, content, clfins.target_name(predicted)))
        output.close()

        valid_points = extract_point(zip(seconds_list, target_list))
        print(valid_points)

        def merge_text2(k, timeline):  # take point k as the anchor and gather ~30s of caption text
            def clean(text):
                # strip watermark fragments before comparing or collecting
                for noise in ("汽车之家", "之家", "汽车之", "看车买车用车", "家看车", "家买车用车"):
                    text = text.replace(noise, "")
                return text.strip().split(" ")[0]

            texts = []
            for x in range(k - 2, k + 30):
                if x not in timeline:
                    continue
                sss = clean(timeline[x])
                if x == k - 2:
                    texts.append(sss)
                    continue
                if x - 1 not in timeline:
                    continue
                sss0 = clean(timeline[x - 1])
                if tf_similarity(sss, sss0) > 0.8:
                    # near-duplicate of the previous second; skip it
                    continue
                texts.append(sss)
            return ','.join(texts)

        # re-run the prediction over the merged text window
        sentence_timeline = dict(sentence_list)
        for k in valid_points:
            new_text = merge_text2(k, sentence_timeline)
            # recompute the classification for this point
            predicted_list = clfins.predict_proba([new_text], 0.5)
            # predicted_list = clfins.predict([new_text])
            valid_points[k] = clfins.target_name(predicted_list[0])
            # print(new_text, clfins.target_name(predicted_list[0]))
        # print(valid_points)

        def merge_points(points):
            # group the candidate seconds by predicted subject
            reverse_points = {}
            for second, subject in points.items():
                reverse_points.setdefault(subject, []).append(second)
            # drop points the classifier could not label
            reverse_points.pop('Unpredict', None)

            # keep only the earliest second for each subject
            new_points = {}
            for subject, seconds in reverse_points.items():
                new_points[int(min(seconds))] = subject

            return new_points

        def predict(docs_list):
            predicted_list = clfins.predict(docs_list)
            return clfins.target_name(predicted_list[0])

        output_points = merge_points(valid_points)
        print(output_points)

        output = open(keypoint_result_file, mode='w', encoding='utf-8')
        for second, keyword in output_points.items():
            text = merge_text2(second, sentence_timeline)
            summary = extract_summary(text, keyword, predict)
            output.write("%s\t%s\t%s|%s\n" % (second, '', keyword, summary))
        output.close()
        print(folder, " finished.")
        print()
        print()
        print()
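For a concrete sense of what the corrected merge_points above produces, a standalone version of the same logic run on a made-up input; the subject names are placeholders:

def merge_points(points):
    # group seconds by subject, drop unclassified points, keep the earliest second
    reverse_points = {}
    for second, subject in points.items():
        reverse_points.setdefault(subject, []).append(second)
    reverse_points.pop('Unpredict', None)
    return {int(min(seconds)): subject
            for subject, seconds in reverse_points.items()}


print(merge_points({12: 'Exterior', 45: 'Exterior', 80: 'Interior', 95: 'Unpredict'}))
# -> {12: 'Exterior', 80: 'Interior'}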
Example #12
def main():
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build('./output')

    # list all videos
    subjects = './data/output'
    for vid in os.listdir(subjects):
        vid_path = os.path.join(subjects, vid)
        subjects_file = "%s/%s.subjects" % (vid_path, vid)
        keypoints_file = "%s/%s.keypoints" % (vid_path, vid)

        seconds_list = []
        sentences_list = []
        subjects_list = []
        with open(subjects_file, mode='r', encoding='utf-8') as subjects_input:
            for line in subjects_input:
                # each line is "<second>\t<sentence>\t<subject>"
                arr = line.rstrip('\n').split("\t")
                seconds_list.append(int(arr[0]))
                sentences_list.append(arr[1])
                subjects_list.append(arr[2])
        print(subjects_list)

        valid_points = extract_point(zip(seconds_list, subjects_list))
        print(valid_points)

        def merge_text2(k, timeline):  # take point k as the anchor and gather ~30s of caption text
            def clean(text):
                # strip watermark fragments before comparing or collecting
                for noise in ("汽车之家", "之家", "汽车之", "看车买车用车", "家看车", "家买车用车"):
                    text = text.replace(noise, "")
                return text.strip().split(" ")[0]

            texts = []
            for x in range(k - 2, k + 30):
                if x not in timeline:
                    continue
                sss = clean(timeline[x])
                if x == k - 2:
                    texts.append(sss)
                    continue
                if x - 1 not in timeline:
                    continue
                sss0 = clean(timeline[x - 1])
                if tf_similarity(sss, sss0) > 0.8:
                    # near-duplicate of the previous second; skip it
                    continue
                texts.append(sss)
            return ','.join(texts)

        # re-run the prediction over the merged text window
        sentence_timeline = dict(zip(seconds_list, sentences_list))
        for k in valid_points:
            new_text = merge_text2(k, sentence_timeline)
            # recompute the classification for this point
            predicted_list = clfins.predict_proba([new_text], 0.5)
            # predicted_list = clfins.predict([new_text])
            valid_points[k] = clfins.target_name(predicted_list[0])
            # print(new_text, clfins.target_name(predicted_list[0]))
        # print(valid_points)

        def merge_points(points):
            # group the candidate seconds by predicted subject
            reverse_points = {}
            for second, subject in points.items():
                reverse_points.setdefault(subject, []).append(second)
            # drop points the classifier could not label
            reverse_points.pop('Unpredict', None)

            # keep only the earliest second for each subject
            new_points = {}
            for subject, seconds in reverse_points.items():
                new_points[int(min(seconds))] = subject

            return new_points

        def predict(docs_list):
            predicted_list = clfins.predict(docs_list)
            return clfins.target_name(predicted_list[0])

        output_points = merge_points(valid_points)
        print(output_points)

        output = open(keypoints_file, mode='w', encoding='utf-8')
        for second, keyword in output_points.items():
            text = merge_text2(second, sentence_timeline)
            summary = extract_summary(text, keyword, predict)
            output.write("%s\t%s\t%s|%s\n" % (second, '', keyword, summary))
        output.close()
        print(vid, " cut finished.")
        print()
        print()
        print()