Example #1
def data_generator(index_dir, window_size, include_stop_words=False):
    """Given a directory and a window size outputs a list of
    (sentence, number of men on screen, 
               number of women on screen,
               mean number of men on screen, 
               mean number of women on screen, 
               channel)

    sentence can be with or without stopwords
    """

    # Paths to the transcript index files
    doc_path = os.path.join(index_dir, 'docs.list')
    lex_path = os.path.join(index_dir, 'words.lex')
    idx_path = os.path.join(index_dir, 'index.bin')

    channel = 'MSNBC'
    var = {'CNN':(1, 82529), 'FOX': (82530, 162639), 'MSNBC': (162640, 246922)}
    SIZE = 20000

    documents = Documents.load(doc_path)
    lexicon = Lexicon.load(lex_path)

    # Getting words
    words = get_lexicon()
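The snippet is cut off before the generation loop, but the var dictionary above already encodes how a document index maps to its channel. A minimal, self-contained sketch of that mapping (the channel_for_doc helper is illustrative and not part of the original code):

def channel_for_doc(doc_idx, ranges):
    # Return the channel whose inclusive (start, end) range contains doc_idx.
    for channel, (lo, hi) in ranges.items():
        if lo <= doc_idx <= hi:
            return channel
    return None

ranges = {'CNN': (1, 82529), 'FOX': (82530, 162639), 'MSNBC': (162640, 246922)}
print(channel_for_doc(100000, ranges))  # -> FOX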
Example #2
        K.set_session(session)
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(G)  # limit the GPUs visible to TensorFlow

    from keras.models import Model
    from keras.models import model_from_json, load_model

    from utils import init_predictor, DecodeCTCPred, Readf, edit_distance, normalized_edit_distance, \
                        BilinearInterpolation, get_lexicon, load_custom_model, open_img, norm, parse_mjsynth

    prng = RandomState(random_state)
    model = load_custom_model(model_path,
                              model_name='/model.json',
                              weights="/final_weights.h5")
    model = init_predictor(model)
    # Character -> class index and its inverse, used to decode the model's predictions
    classes = {j: i for i, j in enumerate(get_lexicon())}
    inverse_classes = {v: k for k, v in classes.items()}

    decoder = DecodeCTCPred(top_paths=1,
                            beam_width=10,
                            inverse_classes=inverse_classes)

    img_size = (imgh, imgW) + (1, )

    if validate:
        if mjsynth:
            fnames = open(os.path.join(image_path, val_fname), "r").readlines()
            fnames = np.array(parse_mjsynth(image_path, fnames))
        else:
            fnames = np.array([
                os.path.join(dp, f)
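The snippet is truncated here, but the classes / inverse_classes mappings built above from get_lexicon() are what let the CTC decoder turn predicted class indices back into characters. A self-contained sketch of that round trip, assuming get_lexicon() returns the list of characters the model can emit (the toy alphabet below is only a stand-in):

# Toy stand-in for get_lexicon(): the characters the CTC model can emit.
lexicon = list("abcdefghijklmnopqrstuvwxyz0123456789")

# Character -> class index, as in the example above.
classes = {ch: i for i, ch in enumerate(lexicon)}
# Class index -> character, used when decoding CTC output back to text.
inverse_classes = {i: ch for ch, i in classes.items()}

# Decoding a sequence of predicted class indices back into a string.
pred_indices = [7, 4, 11, 11, 14]
print("".join(inverse_classes[i] for i in pred_indices))  # -> hello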
Example #3
def main(index_dir, silent, context_size, folder, use_gender):
    doc_path = os.path.join(index_dir, 'docs.list')
    lex_path = os.path.join(index_dir, 'words.lex')
    idx_path = os.path.join(index_dir, 'index.bin')

    documents = Documents.load(doc_path)
    lexicon = Lexicon.load(lex_path)

    words = get_lexicon()
    stop_words = set(
        list(STOP_WORDS) + [
            "know", "don", "ve", "say", "way", "said", "ll", "think", "thing",
            "don’t", "like", "got", "people", "going", "talk", "right",
            "happened", ">>"
        ])
    print("Stop words", stop_words)

    doc_idxs = range(144, 246923)
    word_idx_dic = {}
    idx_counter = 0

    # Create the output folder if it does not exist
    if not os.path.exists(folder):
        os.makedirs(folder)
    # Create the lemmatizer
    stemmer = WordNetLemmatizer()
    with CaptionIndex(idx_path, lexicon, documents) as index:
        for doc_id in tqdm.tqdm(doc_idxs):
            dic = {}
            count = 1
            if use_gender:
                intervals_gender = gender_to_time(str(doc_id), gender_reqs)
                postings = []
                for t1, t2 in intervals_gender:
                    postings.extend(index.intervals(int(doc_id), t1, t2))
            else:
                postings = index.intervals(int(doc_id))

            starttime = None

            for p in postings:
                if starttime is None:
                    starttime = p.start

                # Flush the current chunk and start a new one every 30 seconds
                if p.end - starttime > 30 * count:
                    pickle.dump(
                        dic,
                        open(
                            os.path.join(
                                folder,
                                'Doc_%d_Chunk_%d.p' % (doc_id, count - 1)),
                            'wb'))
                    dic = {}
                    count += 1
                    starttime = p.end

                # Get words in posting
                tokens = index.tokens(0, p.idx, p.len)
                if not tokens:
                    continue
                for token in tokens:
                    word = words[token]
                    # stemmed_word = stemmer.stem(word)
                    if word not in stop_words and len(word) > 1:
                        stemmed_word = stemmer.lemmatize(word)
                        # print("Word {} -> {}".format(word, stemmed_word))
                        if stemmed_word not in word_idx_dic:
                            word_idx_dic[stemmed_word] = idx_counter
                            idx_counter += 1
                        idx_token = word_idx_dic[stemmed_word]
                        if idx_token in dic:
                            dic[idx_token] += 1
                        else:
                            dic[idx_token] = 1
    pickle.dump(word_idx_dic, open(os.path.join(folder, "word_idx.p"), "wb"))
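At its core, the loop above is a streaming bag-of-words: every lemmatized, non-stop-word token is assigned a stable integer id in word_idx_dic and counted per 30-second chunk. A minimal sketch of that indexing and counting step on plain token lists, with no CaptionIndex and no lemmatizer (count_chunk is a hypothetical helper, not part of the original code):

word_idx_dic = {}   # global word -> integer id, grows as new words appear
idx_counter = 0

def count_chunk(tokens, stop_words):
    # Return {word_id: count} for one chunk of tokens.
    global idx_counter
    counts = {}
    for word in tokens:
        if word in stop_words or len(word) <= 1:
            continue
        if word not in word_idx_dic:
            word_idx_dic[word] = idx_counter
            idx_counter += 1
        wid = word_idx_dic[word]
        counts[wid] = counts.get(wid, 0) + 1
    return counts

print(count_chunk(["news", "anchor", "news", "the"], {"the"}))  # -> {0: 2, 1: 1}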
Example #4
def process_train(corpus_train_path, corpus_test_path, prf_file, base_model_weight=None, flag=6):

    # Collect all training corpus files
    raw_train_file = [corpus_train_path + os.sep + type_path + os.sep + type_file \
                      for type_path in os.listdir(corpus_train_path) \
                      for type_file in os.listdir(corpus_train_path + os.sep + type_path)]

    # Collect all test corpus files
    raw_test_file = [corpus_test_path + os.sep + type_path + os.sep + type_file \
                      for type_path in os.listdir(corpus_test_path) \
                      for type_file in os.listdir(corpus_test_path + os.sep + type_path)]

    if flag == 4:  # 4-tag scheme; label 0 is reserved for padding
        label_2_index = {'Pad': 0, 'B': 1, 'M': 2, 'E': 3, 'S': 4, 'Unk': 5}
        index_2_label = {0: 'Pad', 1: 'B', 2: 'M', 3: 'E', 4: 'S', 5: 'Unk'}
        utils.process_data(raw_train_file, 'train.data')
        utils.process_data(raw_test_file, 'test.data')
    else:  # 6-tag scheme
        label_2_index = {'Pad': 0, 'B': 1, 'B2': 2, 'B3': 3, 'M': 4, 'E': 5, 'S': 6, 'Unk': 7}
        index_2_label = {0: 'Pad', 1: 'B', 2: 'B2', 3: 'B3', 4: 'M', 5: 'E', 6: 'S', 7: 'Unk'}
        utils.process_dataB(raw_train_file, 'train.data')
        utils.process_dataB(raw_test_file, 'test.data')
    
    class_label_count = len(label_2_index)

    train_documents = utils.create_documents('train.data')
    test_documents = utils.create_documents('test.data')
    # Build the lexicon (character -> index) and its reverse mapping
    lexicon, lexicon_reverse = utils.get_lexicon(train_documents + test_documents)
    # Number of characters in the lexicon
    print(len(lexicon), len(lexicon_reverse))

    print(len(test_documents))   # number of sentences in the test corpus
    print(len(train_documents))  # number of sentences in the training corpus

    #embedding_model = gensim.models.Word2Vec.load(r'model_embedding_bakeoff2005-50.m') #size = 50
    #embedding_model = gensim.models.Word2Vec.load(r'model_embedding_bakeoff2005.m') #size = 100
    #embedding_model = gensim.models.Word2Vec.load(r'model_embedding_bakeoff2005-150.m') #size = 150

    embedding_model = gensim.models.Word2Vec.load(r'model_embedding_pku_100.m')  # size = 100
    embedding_size = embedding_model.vector_size
    print(embedding_size)

    # Build the pre-trained embedding matrix for the lexicon
    embedding_weights = utils.create_embedding(embedding_model, embedding_size, lexicon_reverse)
    print(embedding_weights.shape)
    
    train_data_list, train_label_list, train_index_list = utils.create_matrix(train_documents, lexicon, label_2_index)
    test_data_list, test_label_list, test_index_list = utils.create_matrix(test_documents, lexicon, label_2_index)
    

    print(len(train_data_list), len(train_label_list), len(train_index_list))
    print(len(test_data_list), len(test_label_list), len(test_index_list))
    # print(train_data_list[0])
    # print(train_label_list[0])
    # Inspect the sentence length distribution
    #print("sentence length distribution")
    #visualization.plot_sentence_length(train_data_list+test_data_list,train_label_list+test_label_list)

    max_len = max(map(len, train_data_list))
    print('maxlen:', max_len)
    #if max_len > 64:
    #    max_len = 64
    print('maxlen:', max_len)

    train_data_array, train_label_list_padding = utils.padding_sentences(train_data_list, train_label_list, max_len)  # fixed length; both are 2-D arrays
    test_data_array, test_label_list_padding = utils.padding_sentences(test_data_list, test_label_list, max_len)

    print(train_data_array.shape)
    print(test_data_array.shape)
    #print(train_data_array[0])

    train_label_array = np_utils.to_categorical(train_label_list_padding, class_label_count). \
        reshape((len(train_label_list_padding), len(train_label_list_padding[0]), -1))

    test_label_array = np_utils.to_categorical(test_label_list_padding, class_label_count). \
        reshape((len(test_label_list_padding), len(test_label_list_padding[0]), -1))  # one-hot encoding for the multi-class labels; result is 3-D
    # number of test sentences * sentence length * class_label_count
    print(train_label_array.shape)
    print(test_label_array.shape)

    # model
    model = CNN_Bilstm_Crf(max_len, len(lexicon), class_label_count, embedding_weights, embedding_size, model_type)
    print(model.input_shape)
    print(model.output_shape)
    model.summary()
    model_name = 'model_%d.png'%model_type
    #plot_model(model, to_file=model_name, show_shapes=True, show_layer_names=True)

    train_nums = len(train_data_array)  # train_data_array is train_data_list zero-padded; entries are character indices in the lexicon

    train_array, val_array = train_data_array[:int(train_nums * 0.9)], train_data_array[int(train_nums * 0.9):]  # 90% of the rows for training, 10% held out for validation against overfitting
    train_label, val_label = train_label_array[:int(train_nums * 0.9)], train_label_array[int(train_nums * 0.9):]

    checkpointer = ModelCheckpoint(filepath='train_model_pku_100_m6.hdf5', verbose=1, \
                                   save_best_only=True, monitor='val_loss', mode='auto')

    hist = model.fit(train_array, train_label, batch_size=256, epochs=4, verbose=1, validation_data=(val_array, val_label), callbacks=[checkpointer])

    # Save the final weights (note: this overwrites the best checkpoint written by ModelCheckpoint above)
    model.save_weights('train_model_pku_100_m6.hdf5')

    print(hist.history['val_loss'])  # validation loss recorded for each epoch
    best_model_epoch = np.argmin(hist.history['val_loss'])
    print('best_model_epoch:', best_model_epoch)

    # Visualize loss / accuracy
    #visualization.plot_acc_loss(hist)
    #visualization.plot_acc(hist)
    #visualization.plot_loss(hist)

    print(hist.history)

    model.load_weights('train_model_pku_100_m6.hdf5')
    # test_data_array: number of test sentences * padded sentence length (character indices in the lexicon)
    test_y_pred = model.predict(test_data_array, batch_size=256, verbose=1)  # predict in batches; returns a numpy array of predictions
    print("test_y_pred.shape:")
    print(test_y_pred.shape)  # number of test sentences * sentence length * class_label_count
    # pred_label holds the predicted tag indices, e.g. [0, 0, ..., 1, 2, 3, 1]; shape: number of sentences * sentence length
    pred_label = np.argmax(test_y_pred, axis=2)

    # save lexicon
    pickle.dump([lexicon, lexicon_reverse, max_len, index_2_label], open('lexicon_pku_100_m6.pkl', 'wb'))

    K.clear_session()  # clear cached data from the Keras session
    # Generate the output files
    # lexicon_reverse maps {index: char}
    real_text_list, pred_text_list, real_label_list, pred_label_list = utils.create_pred_text( \
        lexicon_reverse, test_data_array, pred_label, test_label_list_padding, test_index_list, class_label_count)
    # Arguments: the {index: char} mapping, zero-padded test sentences, predicted labels,
    # zero-padded reference labels, and the index of each test sentence
    # Write the real and predicted segmentations to file
    utils.write_2_file(real_text_list, pred_text_list)
    # score
    F = score.prf_score('real_text.txt', 'pred_text.txt', prf_file, model_type, best_model_epoch, class_label_count)  # returns the average P/R/F score
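The label preparation above pads every tag sequence to max_len and one-hot encodes it into an array of shape (number of sentences, max_len, class_label_count). A numpy-only sketch of that step (pad_and_one_hot is illustrative and not the project's utils API):

import numpy as np

def pad_and_one_hot(label_lists, max_len, num_classes):
    # Pad each tag sequence with 0 ('Pad') to max_len, then one-hot encode it.
    padded = np.zeros((len(label_lists), max_len), dtype=int)
    for i, labels in enumerate(label_lists):
        n = min(len(labels), max_len)
        padded[i, :n] = labels[:n]
    # (num_sentences, max_len) -> (num_sentences, max_len, num_classes)
    return np.eye(num_classes, dtype=np.float32)[padded]

labels = [[1, 4, 5], [6]]                      # e.g. B M E / S in the 6-tag scheme
y = pad_and_one_hot(labels, max_len=4, num_classes=8)
print(y.shape)                                 # -> (2, 4, 8)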
Example #5

__author__ = "Olivares Castillo José Luis"

#tf.enable_eager_execution()
tf.logging.set_verbosity(tf.logging.INFO)

print("TensorFlow version: {}".format(tf.VERSION))
#print("Eager execution: {}".format(tf.executing_eagerly()))

if tf.test.gpu_device_name():
    print("GPU disponible")

#source_lex = "en-it.test"
source_lex = "es-na.test"
words_scr_lexicon, words_trg_lexicon = utils.get_lexicon(source_lex)
print("size of lexicon:", set(words_scr_lexicon).__len__())
#print(len(words_scr_lexicon), len(words_trg_lexicon))


source_str = "es.n2v"
target_str = "na.n2v"
#source_str = "es.norm.n2v"
#source_str = "en.fst"
source_vec = utils.open_file(source_str)
words_src, source_vec = utils.read(source_vec, is_zipped=False)
# list of Spanish words from the seed lexicon
eval_src = list(set(words_scr_lexicon))
src_vec = utils.get_vectors(eval_src, words_src, source_vec)
print("source_vec: " + source_str)
#print(src_vec.shape)
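Here utils.get_vectors presumably selects the embedding row of each seed-lexicon word from the node2vec matrix. A minimal numpy sketch of such a lookup, assuming words_src is the word list aligned with the rows of source_vec (get_vectors_sketch is a hypothetical stand-in, not the real utils function):

import numpy as np

def get_vectors_sketch(eval_words, words_src, source_vec):
    # Return the embedding rows of eval_words, skipping out-of-vocabulary words.
    word_to_row = {w: i for i, w in enumerate(words_src)}
    rows = [word_to_row[w] for w in eval_words if w in word_to_row]
    return source_vec[rows]

words_src = ["casa", "perro", "gato"]
source_vec = np.arange(9, dtype=float).reshape(3, 3)   # toy 3-word, 3-dimensional embeddings
print(get_vectors_sketch(["gato", "casa"], words_src, source_vec))
# [[6. 7. 8.]
#  [0. 1. 2.]]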