def main():
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build('./output')
    # list all videos
    videos_path = './data/videos'
    subjects = './data/output'
    for vid in os.listdir(subjects):
        vid_path = os.path.join(subjects, vid)
        output_path = "%s/" % (vid_path)
        video_file = os.path.join(videos_path, vid)
        subjects_file = "%s/%s.subjects" % (vid_path, vid)
        keypoints_file = "%s/%s.keypoints" % (vid_path, vid)
        process(subjects_file, video_file, output_path, keypoints_file)
        # valid_points = extract_point(zip(seconds_list, subjects_list))
        output = open(keypoints_file, mode='w', encoding='utf-8')
        # for second, keyword in output_points.items():
        #     text = merge_text2(second, sentence_timeline)
        #     summary = extract_summary(text, keyword, predict)
        #     output.write("%s\t%s\t%s|%s\n" % (second, '', keyword, summary))
        output.close()
        print(vid, " cut finished.")
        print()
        print()
        print()
def tokenize(jack_code, xml_format=False):
    tokenizer.init(jack_code)
    tokenizer.run()
    if xml_format:
        return tokenizer.get_tokenized_list()
    else:
        return tokenizer.get_jack_code()
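# Hypothetical usage sketch for tokenize() above. It assumes the module-level
# `tokenizer` has already been imported; the Jack source string is illustrative only.
if __name__ == '__main__':
    sample_jack = 'class Main { function void main() { return; } }'
    xml_tokens = tokenize(sample_jack, xml_format=True)  # token list via tokenizer.get_tokenized_list()
    plain_code = tokenize(sample_jack)                    # default path via tokenizer.get_jack_code()
    print(xml_tokens)
    print(plain_code)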
def main(_):
    os.system('mkdir -p %s' % FLAGS.dir)
    tokenizer.init(FLAGS.tokenizer_vocab)

    global examples, vocab, unk_vocab, char_vocab, pos_vocab, tag_vocab, ner_vocab, ngram_vocab
    examples = pd.read_csv(FLAGS.input)
    #if 'train' in FLAGS.input:
    #    examples = shuffle(examples, random_state=1024)
    vocab = Vocabulary(FLAGS.vocab)
    # unk_vocab is actually a small vocab, so it will generate unk for training
    #unk_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'unk_vocab.txt'))
    char_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'char_vocab.txt'))
    pos_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'pos_vocab.txt'))
    tag_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'tag_vocab.txt'))
    ner_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ner_vocab.txt'))
    ngram_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ngram_vocab.txt'))

    global enprob_dict
    enprob_dict = {}
    enprob_file = '~/data/kaggle/toxic/train.enprob.csv' if 'train' in FLAGS.input else '~/data/kaggle/toxic/test.enprob.csv'
    enprob_df = pd.read_csv(enprob_file)
    for id, enprob in zip(enprob_df['id'].values, enprob_df['enprob'].values):
        enprob_dict[id] = enprob
    enprob_dict['0'] = 1.

    pool = multiprocessing.Pool()
    pool.map(build_features, range(FLAGS.num_records))
    pool.close()
    pool.join()
    #build_features(0)

    print('num_records:', counter.value)
    mode = get_mode()
    out_file = os.path.dirname(FLAGS.vocab) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)
def main(_):
    tokenizer.init(FLAGS.tokenizer_vocab)
    if FLAGS.full_tokenizer:
        gezi.segment.init_spacy_full()
    os.system('mkdir -p %s' % FLAGS.out_dir)
    print('name', FLAGS.name, 'out_dir', FLAGS.out_dir)

    global counter
    counter = WordCounter(write_unknown=FLAGS.write_unknown,
                          most_common=FLAGS.most_common,
                          min_count=FLAGS.min_count)
    global char_counter
    char_counter = WordCounter(write_unknown=FLAGS.write_unknown,
                               most_common=FLAGS.most_common,
                               min_count=FLAGS.min_count)
    global ngram_counter
    ngram_counter = WordCounter(write_unknown=True, min_count=FLAGS.min_count)
    global pos_counter, tag_counter, ner_counter
    pos_counter = WordCounter(write_unknown=True, min_count=1)
    tag_counter = WordCounter(write_unknown=True, min_count=1)
    ner_counter = WordCounter(write_unknown=True, min_count=1)

    run(FLAGS.input)
    if FLAGS.test_input and not FLAGS.name:
        run(FLAGS.test_input, count=FLAGS.test_count)

    if not FLAGS.name:
        vocab_name = FLAGS.vocab_name or 'vocab'
        os.system('mkdir -p %s' % FLAGS.out_dir)
        out_txt = os.path.join(FLAGS.out_dir, '%s.txt' % vocab_name)
        counter.save(out_txt)
        out_txt = os.path.join(FLAGS.out_dir, 'char_%s.txt' % vocab_name)
        char_counter.save(out_txt)
        out_txt = os.path.join(FLAGS.out_dir, 'pos_vocab.txt')
        pos_counter.save(out_txt)
        out_txt = os.path.join(FLAGS.out_dir, 'tag_vocab.txt')
        tag_counter.save(out_txt)
        out_txt = os.path.join(FLAGS.out_dir, 'ner_vocab.txt')
        ner_counter.save(out_txt)
        out_txt = os.path.join(FLAGS.out_dir, 'ngram_vocab.txt')
        if not FLAGS.max_ngrams:
            ngram_counter.save(out_txt)
        else:
            # if only the most frequent ngrams are needed later:
            # head -n 200000 ngram_vocab.full.txt > ngram_vocab.txt
            out_full_txt = os.path.join(FLAGS.out_dir, 'ngram_vocab.full.txt')
            ngram_counter.save(out_full_txt)
            os.system('head -n %d %s > %s' % (FLAGS.max_ngrams, out_full_txt, out_txt))
def main(_):
    tokenizer.init(FLAGS.tokenizer_vocab)
    os.system('mkdir -p %s' % FLAGS.out_dir)
    print('name', FLAGS.name, 'out_dir', FLAGS.out_dir)
    run(FLAGS.input)
    global name
    name = 'test'
    if FLAGS.test_input and not FLAGS.name:
        run(FLAGS.test_input)
def main():
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build(config.naive_bayes_model_path)
    # list all videos
    input_path = './data/output'
    output_path = './data/output'
    for vid in os.listdir(input_path):
        caption_file = os.path.join(input_path, vid, vid + '.captions')
        subject_file = "%s/%s/%s.subjects" % (output_path, vid, vid)
        preprocess(caption_file, subject_file, clfins)
        print(vid, " classify finished.")
        print()
        print()
        print()
def main(_):
    tokenizer.init(FLAGS.tokenizer_vocab)

    global examples, vocab, char_vocab
    examples = pd.read_csv(FLAGS.input)
    #if 'train' in FLAGS.input:
    #    examples = shuffle(examples, random_state=1024)
    vocab = Vocabulary(FLAGS.vocab)
    char_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'char_vocab.txt'))

    pool = multiprocessing.Pool()
    pool.map(build_features, range(FLAGS.num_records))
    pool.close()
    pool.join()
    # build_features(0)

    print('num_records:', counter.value)
    mode = 'train' if 'train' in FLAGS.input else 'test'
    out_file = os.path.dirname(FLAGS.vocab) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)
def main(_):
    tokenizer.init(FLAGS.tokenizer_vocab)

    global counter
    counter = WordCounter(write_unknown=FLAGS.write_unknown,
                          most_common=FLAGS.most_common,
                          min_count=FLAGS.min_count)
    global char_counter
    char_counter = WordCounter(write_unknown=FLAGS.write_unknown,
                               most_common=FLAGS.most_common,
                               min_count=FLAGS.min_count)

    run(FLAGS.input)
    if FLAGS.test_input:
        run(FLAGS.test_input, count=FLAGS.test_count)

    vocab_name = FLAGS.vocab_name or 'vocab'
    os.system('mkdir -p %s' % FLAGS.out_dir)
    out_txt = os.path.join(FLAGS.out_dir, '%s.txt' % vocab_name)
    counter.save(out_txt)
    out_txt = os.path.join(FLAGS.out_dir, 'char_%s.txt' % vocab_name)
    char_counter.save(out_txt)
def process(caption_file, output_file, clfins=None):
    if os.path.exists(output_file):
        return output_file
    if clfins is None:
        init(config.jieba_stopwords_path, config.jieba_userdict_path)
        clfins = build(config.naive_bayes_model_path)
    sentence_list = preprocess(caption_file)
    seconds_list, docs_list = zip(*sentence_list)
    predicted_list = clfins.predict_proba(docs_list, 0.5)
    target_list = []
    output = open(output_file, mode='w', encoding='utf-8')
    for second, content, predicted in zip(seconds_list, docs_list, predicted_list):
        name = clfins.target_name(predicted)
        if name == 'Unpredict':
            name = predict_internal(content)
        target_list.append(name)
        output.write("%s\t%s\t%s\n" % (second, content, name))
    output.close()
    return output_file
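# Hypothetical usage sketch for process() above (the paths are illustrative only):
# it classifies one caption file into a tab-separated "<second>\t<content>\t<subject>"
# file; when clfins is None, process() builds its own Classifier from config.
if __name__ == '__main__':
    demo_captions = './data/output/demo/demo.captions'  # assumed demo input
    demo_subjects = './data/output/demo/demo.subjects'  # output written here
    print(process(demo_captions, demo_subjects))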
test_predicted = clfins.predict_proba(test_docs, 0.6)
total = 0
right = 0
for filename, doc, category, predict in zip(test_data.filenames, test_docs,
                                             test_data.target, test_predicted):
    if predict is None:
        continue
    total = total + 1
    if test_data.target_names[category] == clfins.target_name(predict):
        right = right + 1
        continue
    print('%r => %s| right:%s, predict:%s, %s' %
          (filename, doc, test_data.target_names[category],
           clfins.target_name(predict), (predict)))
print(total, ",", right, ",", right / total)


def build(model_path):
    clfins = Classifier(model_path)
    return clfins


if __name__ == '__main__':
    init("../jieba/stopwords_cn.txt", "../jieba/userdict.txt")
    clfins = build("../models")
    test(clfins)
    test_performence(clfins)
def main():
    clfins = build('./output')
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    # list all videos
    videos_folder_path = './data/captions'
    for folder in os.listdir(videos_folder_path):
        if folder != '24CFC579E390B590':
            # if folder != '11AFCABB49FADB3D':
            pass
            # continue
        one_video_folder_path = os.path.join(videos_folder_path, folder)
        caption_file = "./%s/%s_baidu_ocr_result.txt" % (one_video_folder_path, folder)
        file_name_output = "./%s/%s_category_result.txt" % (one_video_folder_path, folder)
        classify_result_file = "./%s/classify_result.txt" % (one_video_folder_path)
        keypoint_result_file = "./%s/keypoint_result.txt" % (one_video_folder_path)

        # classify every OCR caption line and write the per-second results
        sentence_list = preprocess(caption_file)
        seconds_list, docs_list = zip(*sentence_list)
        predicted_list = clfins.predict_proba(docs_list, 0.5)
        target_list = []
        output = open(classify_result_file, mode='w', encoding='utf-8')
        for second, content, predicted in zip(seconds_list, docs_list, predicted_list):
            target_list.append(clfins.target_name(predicted))
            output.write("%s\t%s\t%s\n" % (second, content, clfins.target_name(predicted)))
        output.close()

        valid_points = extract_point(zip(seconds_list, target_list))
        print(valid_points)

        def merge_text2(k, timeline):
            # take point k as the anchor and collect roughly the next 30 seconds of text
            texts = []
            for x in range(k - 2, k + 30):
                if x not in timeline.keys():
                    continue
                sss = (timeline[x].replace("汽车之家", "").replace("之家", "")
                       .replace("汽车之", "").replace("看车买车用车", "")
                       .replace("家看车", "").replace("家买车用车", "").strip())
                sss = sss.split(" ")[0]
                if x == k - 2:
                    texts.append(sss)
                    continue
                if x - 1 not in timeline.keys():
                    continue
                sss0 = (timeline[x - 1].replace("汽车之家", "").replace("之家", "")
                        .replace("汽车之", "").replace("看车买车用车", "")
                        .replace("家看车", "").replace("家买车用车", "").strip())
                sss0 = sss0.split(" ")[0]
                # skip lines that are near-duplicates of the previous second
                if tf_similarity(sss, sss0) > 0.8:
                    # print(sss0, sss)
                    continue
                texts.append(sss)
            return ','.join(texts)

        # predict again on the merged 30-second windows
        sentence_timeline = dict(sentence_list)
        for k in valid_points:
            new_text = merge_text2(k, sentence_timeline)
            # recompute the classification
            predicted_list = clfins.predict_proba([new_text], 0.5)
            # predicted_list = clfins.predict([new_text])
            valid_points[k] = clfins.target_name(predicted_list[0])
            # print(new_text, clfins.target_name(predicted_list[0]))
        # print(valid_points)

        def merge_points(points):
            # group seconds by predicted subject and keep the earliest second per subject
            reverse_points = {}
            for k, v in points.items():
                if v in reverse_points.keys():
                    reverse_points[v].append(k)
                else:
                    reverse_points[v] = [k]
            if 'Unpredict' in reverse_points.keys():
                reverse_points.pop('Unpredict')
            new_points = {}
            for ks, v in reverse_points.items():
                sortedv = sorted(v)
                new_points[int(sortedv[0])] = ks
            return new_points

        def predict(docs_list):
            predicted_list = clfins.predict(docs_list)
            return clfins.target_name(predicted_list[0])

        output_points = merge_points(valid_points)
        print(output_points)
        output = open(keypoint_result_file, mode='w', encoding='utf-8')
        for second, keyword in output_points.items():
            text = merge_text2(second, sentence_timeline)
            summary = extract_summary(text, keyword, predict)
            output.write("%s\t%s\t%s|%s\n" % (second, '', keyword, summary))
        output.close()
        print(folder, " finished.")
        print()
        print()
        print()
def main():
    init("./jieba/stopwords_cn.txt", "./jieba/userdict.txt")
    clfins = build('./output')
    # list all videos
    subjects = './data/output'
    for vid in os.listdir(subjects):
        vid_path = os.path.join(subjects, vid)
        subjects_file = "%s/%s.subjects" % (vid_path, vid)
        keypoints_file = "%s/%s.keypoints" % (vid_path, vid)

        # read the per-second subject predictions written earlier
        seconds_list = []
        sentences_list = []
        subjects_list = []
        subjects_file = open(subjects_file, mode='r', encoding='utf-8')
        for line in subjects_file.readlines():
            arr = line.rstrip('\n').split("\t")  # strip the newline so the subject field is clean
            seconds_list.append(int(arr[0]))
            sentences_list.append(arr[1])
            subjects_list.append(arr[2])
        subjects_file.close()
        print(subjects_list)

        valid_points = extract_point(zip(seconds_list, subjects_list))
        print(valid_points)

        def merge_text2(k, timeline):
            # take point k as the anchor and collect roughly the next 30 seconds of text
            texts = []
            for x in range(k - 2, k + 30):
                if x not in timeline.keys():
                    continue
                sss = (timeline[x].replace("汽车之家", "").replace("之家", "")
                       .replace("汽车之", "").replace("看车买车用车", "")
                       .replace("家看车", "").replace("家买车用车", "").strip())
                sss = sss.split(" ")[0]
                if x == k - 2:
                    texts.append(sss)
                    continue
                if x - 1 not in timeline.keys():
                    continue
                sss0 = (timeline[x - 1].replace("汽车之家", "").replace("之家", "")
                        .replace("汽车之", "").replace("看车买车用车", "")
                        .replace("家看车", "").replace("家买车用车", "").strip())
                sss0 = sss0.split(" ")[0]
                # skip lines that are near-duplicates of the previous second
                if tf_similarity(sss, sss0) > 0.8:
                    # print(sss0, sss)
                    continue
                texts.append(sss)
            return ','.join(texts)

        # predict again on the merged 30-second windows
        sentence_timeline = dict(zip(seconds_list, sentences_list))
        for k in valid_points:
            new_text = merge_text2(k, sentence_timeline)
            # recompute the classification
            predicted_list = clfins.predict_proba([new_text], 0.5)
            # predicted_list = clfins.predict([new_text])
            valid_points[k] = clfins.target_name(predicted_list[0])
            # print(new_text, clfins.target_name(predicted_list[0]))
        # print(valid_points)

        def merge_points(points):
            # group seconds by predicted subject and keep the earliest second per subject
            reverse_points = {}
            for k, v in points.items():
                if v in reverse_points.keys():
                    reverse_points[v].append(k)
                else:
                    reverse_points[v] = [k]
            if 'Unpredict' in reverse_points.keys():
                reverse_points.pop('Unpredict')
            new_points = {}
            for ks, v in reverse_points.items():
                sortedv = sorted(v)
                new_points[int(sortedv[0])] = ks
            return new_points

        def predict(docs_list):
            predicted_list = clfins.predict(docs_list)
            return clfins.target_name(predicted_list[0])

        output_points = merge_points(valid_points)
        print(output_points)
        output = open(keypoints_file, mode='w', encoding='utf-8')
        for second, keyword in output_points.items():
            text = merge_text2(second, sentence_timeline)
            summary = extract_summary(text, keyword, predict)
            output.write("%s\t%s\t%s|%s\n" % (second, '', keyword, summary))
        output.close()
        print(vid, " cut finished.")
        print()
        print()
        print()