def data_generator(index_dir, window_size, include_stop_words=False): """Given a directory and a window size outputs a list of (sentence, number of men on screen, number of women on screen, mean number of men on screen, mean number of women on screen, channel) sentence can be with or without stopwords """ # Open the transcript files doc_path = os.path.join(index_dir, 'docs.list') lex_path = os.path.join(index_dir, 'words.lex') idx_path = os.path.join(index_dir, 'index.bin') channel = 'MSNBC' var = {'CNN':(1, 82529), 'FOX': (82530, 162639), 'MSNBC': (162640, 246922)} SIZE = 20000 documents = Documents.load(doc_path) lexicon = Lexicon.load(lex_path) # Getting words words = get_lexicon()
K.set_session(session) else: os.environ["CUDA_VISIBLE_DEVICES"] = str(G) from keras.models import Model from keras.models import model_from_json, load_model from utils import init_predictor, DecodeCTCPred, Readf, edit_distance, normalized_edit_distance, \ BilinearInterpolation, get_lexicon, load_custom_model, open_img, norm, parse_mjsynth prng = RandomState(random_state) model = load_custom_model(model_path, model_name='/model.json', weights="/final_weights.h5") model = init_predictor(model) classes = {j: i for i, j in enumerate(get_lexicon())} inverse_classes = {v: k for k, v in classes.items()} decoder = DecodeCTCPred(top_paths=1, beam_width=10, inverse_classes=inverse_classes) img_size = (imgh, imgW) + (1, ) if validate: if mjsynth: fnames = open(os.path.join(image_path, val_fname), "r").readlines() fnames = np.array(parse_mjsynth(image_path, fnames)) else: fnames = np.array([ os.path.join(dp, f)
def main(index_dir, silent, context_size, folder, use_gender):
    """Build per-document bag-of-words chunk files from a caption index.

    Walks every document in the caption index, lemmatizes each
    non-stop-word token, and roughly every 30 seconds of captions dumps a
    ``{word_index: count}`` dict to ``<folder>/Doc_<doc_id>_Chunk_<n>.p``.
    The global lemma -> index mapping is written to
    ``<folder>/word_idx.p`` at the end.

    Parameters
    ----------
    index_dir : str
        Directory containing 'docs.list', 'words.lex' and 'index.bin'.
    silent : unused here; kept for caller compatibility.
    context_size : unused here; kept for caller compatibility.
    folder : str
        Output directory; created if missing.
    use_gender : bool
        If true, only caption intervals returned by ``gender_to_time``
        are scanned instead of the whole document.
    """
    doc_path = os.path.join(index_dir, 'docs.list')
    lex_path = os.path.join(index_dir, 'words.lex')
    idx_path = os.path.join(index_dir, 'index.bin')
    documents = Documents.load(doc_path)
    lexicon = Lexicon.load(lex_path)
    words = get_lexicon()
    # Stop-word list: spaCy's STOP_WORDS plus transcript-specific noise
    # tokens (clipped contractions, speaker markers such as ">>", ...).
    stop_words = set(
        list(STOP_WORDS) + [
            "know", "don", "ve", "say", "way", "said", "ll", "think",
            "thing", "don’t", "like", "got", "people", "going", "talk",
            "right", "happened", ">>"
        ])
    print("Stop words", stop_words)
    # NOTE(review): document id range is hard-coded — presumably the full
    # corpus span; confirm against the index build.
    doc_idxs = range(144, 246923)
    word_idx_dic = {}
    idx_counter = 0
    # Create the output folder if needed
    if not os.path.exists(folder):
        os.makedirs(folder)
    # WordNet lemmatizer (used instead of a stemmer; old name kept)
    stemmer = WordNetLemmatizer()
    with CaptionIndex(idx_path, lexicon, documents) as index:
        for doc_id in tqdm.tqdm(doc_idxs):
            dic = {}
            count = 1
            if use_gender:
                intervals_gender = gender_to_time(str(doc_id), gender_reqs)
                postings = []
                for t1, t2 in intervals_gender:
                    postings.extend(index.intervals(int(doc_id), t1, t2))
            else:
                postings = index.intervals(int(doc_id))
            starttime = None
            for p in postings:
                if starttime is None:
                    starttime = p.start
                # Flush the running counts every 30 seconds of captions.
                if p.end - starttime > 30 * count:
                    # Use a context manager so each chunk file is closed
                    # promptly (the original leaked one handle per chunk).
                    chunk_path = os.path.join(
                        folder, 'Doc_%d_Chunk_%d.p' % (doc_id, count - 1))
                    with open(chunk_path, 'wb') as f:
                        pickle.dump(dic, f)
                    dic = {}
                    count += 1
                    starttime = p.end
                # Get words in this posting.
                # NOTE(review): first argument is a hard-coded 0, not
                # doc_id — looks suspicious; confirm against
                # CaptionIndex.tokens() before relying on per-doc output.
                tokens = index.tokens(0, p.idx, p.len)
                if not tokens:
                    continue
                for token in tokens:
                    word = words[token]
                    if word not in stop_words and len(word) > 1:
                        stemmed_word = stemmer.lemmatize(word)
                        # Assign the next free index to unseen lemmas.
                        if stemmed_word not in word_idx_dic:
                            word_idx_dic[stemmed_word] = idx_counter
                            idx_counter += 1
                        idx_token = word_idx_dic[stemmed_word]
                        dic[idx_token] = dic.get(idx_token, 0) + 1
    # Persist the global lemma -> index mapping (handle closed via with).
    with open(os.path.join(folder, "word_idx.p"), "wb") as f:
        pickle.dump(word_idx_dic, f)
def process_train(corpus_train_path, corpus_test_path, prf_file, base_model_weight=None, flag=6):
    """Train the CNN-BiLSTM-CRF segmenter and score it on the test corpus.

    Reads raw corpora laid out as ``<root>/<type_dir>/<file>``, converts
    them to tagged training data, trains the model with pre-trained
    character embeddings, restores the best checkpoint, predicts on the
    test set, writes 'real_text.txt' / 'pred_text.txt' and computes
    precision/recall/F into *prf_file*.

    Parameters
    ----------
    corpus_train_path, corpus_test_path : str
        Root directories of the raw training / test corpora.
    prf_file : str
        File that receives the P/R/F scores.
    base_model_weight : unused here; kept for caller compatibility.
    flag : int
        4 selects the 4-tag (B/M/E/S) scheme, anything else the 6-tag
        (B/B2/B3/M/E/S) scheme.  Label 0 is always the padding label.

    NOTE(review): relies on module-level names ``model_type``,
    ``CNN_Bilstm_Crf``, ``utils``, ``score``, ``np_utils``, ``K`` —
    confirm they are defined at import time.
    """
    # Collect every corpus file path: <root>/<type_dir>/<file>.
    raw_train_file = [corpus_train_path + os.sep + type_path + os.sep + type_file
                      for type_path in os.listdir(corpus_train_path)
                      for type_file in os.listdir(corpus_train_path + os.sep + type_path)]
    raw_test_file = [corpus_test_path + os.sep + type_path + os.sep + type_file
                     for type_path in os.listdir(corpus_test_path)
                     for type_file in os.listdir(corpus_test_path + os.sep + type_path)]

    if flag == 4:  # 4-tag scheme; label 0 is the padding label
        label_2_index = {'Pad': 0, 'B': 1, 'M': 2, 'E': 3, 'S': 4, 'Unk': 5}
        index_2_label = {0: 'Pad', 1: 'B', 2: 'M', 3: 'E', 4: 'S', 5: 'Unk'}
        utils.process_data(raw_train_file, 'train.data')
        utils.process_data(raw_test_file, 'test.data')
    else:  # 6-tag scheme
        label_2_index = {'Pad': 0, 'B': 1, 'B2': 2, 'B3': 3, 'M': 4, 'E': 5, 'S': 6, 'Unk': 7}
        index_2_label = {0: 'Pad', 1: 'B', 2: 'B2', 3: 'B3', 4: 'M', 5: 'E', 6: 'S', 7: 'Unk'}
        utils.process_dataB(raw_train_file, 'train.data')
        utils.process_dataB(raw_test_file, 'test.data')
    class_label_count = len(label_2_index)

    train_documents = utils.create_documents('train.data')
    test_documents = utils.create_documents('test.data')

    # Build the character lexicon over train + test.
    lexicon, lexicon_reverse = utils.get_lexicon(train_documents + test_documents)
    print(len(lexicon), len(lexicon_reverse))  # number of characters in the lexicon
    print(len(test_documents))   # number of test sentences after splitting
    print(len(train_documents))  # number of training sentences after splitting

    # Pre-trained character embeddings (100-dim PKU word2vec model).
    embedding_model = gensim.models.Word2Vec.load(r'model_embedding_pku_100.m')
    embedding_size = embedding_model.vector_size
    print(embedding_size)
    embedding_weights = utils.create_embedding(embedding_model, embedding_size, lexicon_reverse)
    print(embedding_weights.shape)

    # Index-encode sentences and labels.
    train_data_list, train_label_list, train_index_list = utils.create_matrix(train_documents, lexicon, label_2_index)
    test_data_list, test_label_list, test_index_list = utils.create_matrix(test_documents, lexicon, label_2_index)
    print(len(train_data_list), len(train_label_list), len(train_index_list))
    print(len(test_data_list), len(test_label_list), len(test_index_list))

    # Longest training sentence determines the padded length.
    max_len = max(map(len, train_data_list))
    print('maxlen:', max_len)

    # Pad every sentence (and its label sequence) to max_len -> 2-D arrays.
    train_data_array, train_label_list_padding = utils.padding_sentences(train_data_list, train_label_list, max_len)
    test_data_array, test_label_list_padding = utils.padding_sentences(test_data_list, test_label_list, max_len)
    print(train_data_array.shape)
    print(test_data_array.shape)

    # One-hot the labels: (sentences, max_len, class_label_count).
    train_label_array = np_utils.to_categorical(train_label_list_padding, class_label_count). \
        reshape((len(train_label_list_padding), len(train_label_list_padding[0]), -1))
    test_label_array = np_utils.to_categorical(test_label_list_padding, class_label_count). \
        reshape((len(test_label_list_padding), len(test_label_list_padding[0]), -1))
    print(train_label_array.shape)
    print(test_label_array.shape)

    # Build the model.
    model = CNN_Bilstm_Crf(max_len, len(lexicon), class_label_count,
                           embedding_weights, embedding_size, model_type)
    print(model.input_shape)
    print(model.output_shape)
    model.summary()
    model_name = 'model_%d.png' % model_type

    # 90/10 train/validation split (rows are already index-encoded chars).
    train_nums = len(train_data_array)
    train_array, val_array = train_data_array[:int(train_nums * 0.9)], train_data_array[int(train_nums * 0.9):]
    train_label, val_label = train_label_array[:int(train_nums * 0.9)], train_label_array[int(train_nums * 0.9):]

    # Checkpoint keeps only the best (lowest val_loss) weights on disk.
    checkpointer = ModelCheckpoint(filepath='train_model_pku_100_m6.hdf5', verbose=1,
                                   save_best_only=True, monitor='val_loss', mode='auto')
    hist = model.fit(train_array, train_label, batch_size=256, epochs=4, verbose=1,
                     validation_data=(val_array, val_label), callbacks=[checkpointer])

    # BUG FIX: the original called model.save_weights() on the SAME file
    # here, overwriting the checkpointed best weights with final-epoch
    # weights and defeating save_best_only.  The checkpoint file already
    # holds the best weights, so no extra save is performed.
    print(hist.history['val_loss'])  # per-epoch validation loss
    best_model_epoch = np.argmin(hist.history['val_loss'])
    print('best_model_epoch:', best_model_epoch)
    print(hist.history)

    # Restore the best-epoch weights before predicting.
    model.load_weights('train_model_pku_100_m6.hdf5')

    # Predict per-character label distributions:
    # (test sentences, max_len, class_label_count).
    test_y_pred = model.predict(test_data_array, batch_size=256, verbose=1)
    print("test_y_pred.shape:")
    print(test_y_pred.shape)
    # Hard labels per character: (test sentences, max_len).
    pred_label = np.argmax(test_y_pred, axis=2)

    # Persist the lexicon and label mapping for later decoding
    # (handle closed via context manager; the original leaked it).
    with open('lexicon_pku_100_m6.pkl', 'wb') as f:
        pickle.dump([lexicon, lexicon_reverse, max_len, index_2_label], f)

    K.clear_session()  # release cached TF session state

    # Reconstruct real vs. predicted text from indices and write both files.
    real_text_list, pred_text_list, real_label_list, pred_label_list = utils.create_pred_text(
        lexicon_reverse, test_data_array, pred_label, test_label_list_padding,
        test_index_list, class_label_count)
    utils.write_2_file(real_text_list, pred_text_list)

    # Precision/recall/F of predicted vs. real segmentation (mean scores).
    F = score.prf_score('real_text.txt', 'pred_text.txt', prf_file,
                        model_type, best_model_epoch, class_label_count)
__author__ = "Olivares Castillo José Luis" #tf.enable_eager_execution() tf.logging.set_verbosity(tf.logging.INFO) print("TensorFlow version: {}".format(tf.VERSION)) #print("Eager execution: {}".format(tf.executing_eagerly())) if tf.test.gpu_device_name(): print("GPU disponible") #source_lex = "en-it.test" source_lex = "es-na.test" words_scr_lexicon, words_trg_lexicon = utils.get_lexicon(source_lex) print("size of lexicon:", set(words_scr_lexicon).__len__()) #print(len(words_scr_lexicon), len(words_trg_lexicon)) source_str = "es.n2v" target_str = "na.n2v" #source_str = "es.norm.n2v" #source_str = "en.fst" source_vec = utils.open_file(source_str) words_src, source_vec = utils.read(source_vec, is_zipped=False) # lista de palabras en español del lexicon semilla eval_src = list(set(words_scr_lexicon)) src_vec = utils.get_vectors(eval_src, words_src, source_vec) print("source_vec: " + source_str) #print(src_vec.shape)