def prepare_sentence_data(datapath, vocab_path, embedding_path=None, embedding='glove',
                          embedd_dim=100, prompt_id=1, vocab_size=0, tokenize_text=True,
                          to_lower=True, sort_by_len=False):
    assert len(datapath) == 1, "data path should only include the train path"
    (train_x, train_y, train_prompts), vocab, overal_maxlen, overal_maxnum = \
        reader.get_data(datapath, vocab_path, prompt_id, vocab_size,
                        tokenize_text=tokenize_text, to_lower=to_lower,
                        sort_by_len=sort_by_len)
    X_train, y_train, mask_train = utils.padding_sentence_sequences(
        train_x, train_y, overal_maxnum, overal_maxlen, post_padding=True)
    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)

    # Convert scores to the [0, 1] range for training and evaluation (loss calculation).
    # Changed here: pass train_prompts instead of prompt_id, so that when training
    # across all prompts (prompt_id == -1) each essay is scaled by its own prompt.
    if prompt_id == -1:
        Y_train = reader.get_model_friendly_scores(y_train, train_prompts)
        scaled_train_mean = Y_train.mean()
    else:
        Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
        scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)

    logger.info('Statistics:')
    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
                (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = utils.load_word_embedding_dict(
            embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = utils.build_embedd_table(vocab, embedd_dict, embedd_dim,
                                                 logger, caseless=True)
    else:
        embedd_matrix = None

    return (X_train, Y_train, mask_train), vocab, len(vocab), embedd_matrix, \
        overal_maxlen, overal_maxnum, scaled_train_mean
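
# For reference, a minimal sketch of the [0, 1] scaling that
# reader.get_model_friendly_scores is expected to perform. The per-prompt
# score ranges below follow the ASAP essay-scoring convention; they are an
# assumption about the reader module, not its actual implementation.
import numpy as np

ASAP_RANGES = {1: (2, 12), 2: (1, 6), 3: (0, 3), 4: (0, 3),
               5: (0, 4), 6: (0, 4), 7: (0, 30), 8: (0, 60)}

def scale_scores_sketch(scores, prompt_id):
    low, high = ASAP_RANGES[prompt_id]
    return (np.asarray(scores, dtype='float32') - low) / (high - low)

# e.g. scale_scores_sketch([2, 7, 12], prompt_id=1) -> array([0., 0.5, 1.], dtype=float32)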
logger.info("Padding Training set ...") char_index_train_pad = dp.construct_padded_char( char_index_train, char_alphabet, max_sent_length=max_length, max_char_per_word=max_char_per_word) logger.info("Padding Dev set ...") char_index_dev_pad = dp.construct_padded_char( char_index_dev, char_alphabet, max_sent_length=max_length, max_char_per_word=max_char_per_word) #logger.info("Generating data with fine tuning...") embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict( embedding, embedding_path, logger) logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless)) #Create an embedding table where if the word from training/train/dev set is in glove , then assign glove values else assign random values embedd_table = dp.build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless) word_vocab = word_alphabet.instances word_vocab_size = len(word_vocab) char_vocab = char_alphabet.instances char_vocab_size = len(char_vocab) num_classes = len( label_alphabet.instances) + 1 #to account for zero index we dont use #logger.info("length of the embedding table is %d" , embedd_table.shape[0]) #Store the parameters for loading in test set Flags_Dict['sequence_length'] = max_length
config["word_length"] = max_char_per_word logger.info("set Maximum character length to %d" % max_char_per_word) logger.info("Padding Training set ...") char_index_train_pad = de.construct_padded_char( char_index_train, char_alphabet, max_sent_length=max_length, max_char_per_word=max_char_per_word) logger.info("Padding Dev set ...") char_index_dev_pad = de.construct_padded_char( char_index_dev, char_alphabet, max_sent_length=max_length, max_char_per_word=max_char_per_word) embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict( "glove", FLAGS.embedding_path, logger) logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless)) embedd_table = de.build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless) char_embedd_table = de.build_char_embedd_table(char_alphabet, config["char_embded_size"]) logger.info("build embedding complete") ner = RNN_CNNs(config, embedd_table, char_embedd_table) logger.info("Model Created") f1_s = open("f1.txt", "w") dev_prediction = ner.transform(word_index_sentences_dev_pad, char_index_dev_pad, dev_seq_length) f1 = _f1(config, dev_prediction, label_index_sentences_dev_pad, dev_sent_len, "micro") print("\nEvaluate:\n")
def prepare_sentence_data(datapaths, embedding_path=None, embedding='word2vec',
                          embedd_dim=100, prompt_id=1, vocab_size=0, tokenize_text=True,
                          to_lower=True, sort_by_len=False, vocab_path=None, score_index=6):
    assert len(datapaths) == 4, \
        "data paths should include train, dev, test and description paths"
    (train_x, train_y, train_prompts, train_ids), \
        (dev_x, dev_y, dev_prompts, dev_ids), \
        (test_x, test_y, test_prompts, test_ids), \
        vocab, overal_maxlen, overal_maxnum = \
        reader.get_data(datapaths, prompt_id, vocab_size,
                        tokenize_text=tokenize_text, to_lower=to_lower,
                        sort_by_len=sort_by_len, vocab_path=vocab_path,
                        score_index=score_index)

    # datapaths[3] holds the prompt description file
    train_d, max_sentnum = reader.read_description(datapaths[3], vocab, len(train_x),
                                                   tokenize_text=True, to_lower=True)
    dev_d, max_sentnum = reader.read_description(datapaths[3], vocab, len(dev_x),
                                                 tokenize_text=True, to_lower=True)
    test_d, max_sentnum = reader.read_description(datapaths[3], vocab, len(test_x),
                                                  tokenize_text=True, to_lower=True)

    X_train, y_train, mask_train = utils.padding_sentence_sequences(
        train_x, train_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_dev, y_dev, mask_dev = utils.padding_sentence_sequences(
        dev_x, dev_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_test, y_test, mask_test = utils.padding_sentence_sequences(
        test_x, test_y, overal_maxnum, overal_maxlen, post_padding=True)

    D_train, mask_d_train = utils.padding_des_sequences(
        train_d, max_sentnum, overal_maxlen, post_padding=True)
    D_dev, mask_d_dev = utils.padding_des_sequences(
        dev_d, max_sentnum, overal_maxlen, post_padding=True)
    D_test, mask_d_test = utils.padding_des_sequences(
        test_d, max_sentnum, overal_maxlen, post_padding=True)

    if prompt_id:
        train_pmt = np.array(train_prompts, dtype='int32')
        dev_pmt = np.array(dev_prompts, dtype='int32')
        test_pmt = np.array(test_prompts, dtype='int32')

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    dev_mean = y_dev.mean(axis=0)
    dev_std = y_dev.std(axis=0)
    test_mean = y_test.mean(axis=0)
    test_std = y_test.std(axis=0)

    # We need the dev and test sets in the original scale for evaluation
    # dev_y_org = y_dev.astype(reader.get_ref_dtype())
    # test_y_org = y_test.astype(reader.get_ref_dtype())
    # Convert scores to the [0, 1] range for training and evaluation (loss calculation)
    Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
    Y_dev = reader.get_model_friendly_scores(y_dev, prompt_id)
    Y_test = reader.get_model_friendly_scores(y_test, prompt_id)
    scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)

    logger.info('Statistics:')
    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  dev X shape:   ' + str(X_dev.shape))
    logger.info('  test X shape:  ' + str(X_test.shape))
    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  dev Y shape:   ' + str(Y_dev.shape))
    logger.info('  test Y shape:  ' + str(Y_test.shape))
    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
                (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = utils.load_word_embedding_dict(
            embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = utils.build_embedd_table(vocab, embedd_dict, embedd_dim,
                                                 logger, caseless=True)
    else:
        embedd_matrix = None

    return (X_train, Y_train, D_train, mask_train, train_ids), \
        (X_dev, Y_dev, D_dev, mask_dev, dev_ids), \
        (X_test, Y_test, D_test, mask_test, test_ids), \
        vocab, len(vocab), embedd_matrix, overal_maxlen, overal_maxnum, \
        max_sentnum, scaled_train_mean
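
# A minimal sketch, under assumptions, of what utils.padding_sentence_sequences
# produces: each essay becomes a (max_sentnum, max_sentlen) grid of word ids
# with post-padding, plus a parallel 0/1 mask over the real tokens. The actual
# helper may differ in argument order or mask dtype.
import numpy as np

def pad_essays_sketch(essays, scores, max_sentnum, max_sentlen):
    X = np.zeros((len(essays), max_sentnum, max_sentlen), dtype='int32')
    mask = np.zeros(X.shape, dtype='float32')
    for i, essay in enumerate(essays):
        for j, sent in enumerate(essay[:max_sentnum]):
            toks = sent[:max_sentlen]
            X[i, j, :len(toks)] = toks
            mask[i, j, :len(toks)] = 1.0
    y = np.array(scores, dtype='float32')
    return X, y, mask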
def prepare_sentence_data(datapaths, embedding_path=None, embedding='word2vec',
                          emb_dim=100, prompt_id=1, vocab_size=0, tokenize_text=True,
                          to_lower=True, vocab_path=None, score_index=6,
                          need_context=True):
    assert len(datapaths) == 3, "data paths should include train, dev and test path"
    (train_x, train_y, train_prompts, train_text), \
        (dev_x, dev_y, dev_prompts, dev_text), \
        (test_x, test_y, test_prompts, test_text), \
        vocab, overall_maxlen, overall_maxnum = \
        reader.get_data(datapaths, prompt_id, vocab_size, tokenize_text,
                        to_lower, vocab_path, score_index)

    X_train, y_train, mask_train = utils.padding_sentence_sequences(
        train_x, train_y, overall_maxnum, overall_maxlen, post_padding=True)
    X_dev, y_dev, mask_dev = utils.padding_sentence_sequences(
        dev_x, dev_y, overall_maxnum, overall_maxlen, post_padding=True)
    X_test, y_test, mask_test = utils.padding_sentence_sequences(
        test_x, test_y, overall_maxnum, overall_maxlen, post_padding=True)

    if need_context:
        context, context_len, context_num = reader.get_context(prompt_id, vocab, to_lower)
    else:
        # Dummy context
        context = [[0]]
        context_len = 1
        context_num = 1
    train_context = [context] * len(train_x)
    dev_context = [context] * len(dev_x)
    test_context = [context] * len(test_x)
    train_context, _, _ = utils.padding_sentence_sequences(
        train_context, train_y, context_num, context_len, post_padding=True)
    dev_context, _, _ = utils.padding_sentence_sequences(
        dev_context, dev_y, context_num, context_len, post_padding=True)
    test_context, _, _ = utils.padding_sentence_sequences(
        test_context, test_y, context_num, context_len, post_padding=True)

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    dev_mean = y_dev.mean(axis=0)
    dev_std = y_dev.std(axis=0)
    test_mean = y_test.mean(axis=0)
    test_std = y_test.std(axis=0)

    # We need the dev and test sets in the original scale for evaluation
    # dev_y_org = y_dev.astype(reader.get_ref_dtype())
    # test_y_org = y_test.astype(reader.get_ref_dtype())
    # Convert scores to the [0, 1] range for training and evaluation (loss calculation)
    Y_train = utils.get_model_friendly_scores(y_train, prompt_id)
    Y_dev = utils.get_model_friendly_scores(y_dev, prompt_id)
    Y_test = utils.get_model_friendly_scores(y_test, prompt_id)
    scaled_train_mean = utils.get_model_friendly_scores(train_mean, prompt_id)
    scaled_dev_mean = utils.get_model_friendly_scores(dev_mean, prompt_id)
    scaled_test_mean = utils.get_model_friendly_scores(test_mean, prompt_id)

    logger.info('Statistics:')
    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  dev X shape:   ' + str(X_dev.shape))
    logger.info('  test X shape:  ' + str(X_test.shape))
    if need_context:
        logger.info('  train context shape: ' + str(train_context.shape))
        logger.info('  dev context shape:   ' + str(dev_context.shape))
        logger.info('  test context shape:  ' + str(test_context.shape))
    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  dev Y shape:   ' + str(Y_dev.shape))
    logger.info('  test Y shape:  ' + str(Y_test.shape))
    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
                (str(train_mean), str(train_std), str(scaled_train_mean)))
    logger.info('  dev_y mean: %s, stdev: %s, dev_y mean after scaling: %s' %
                (str(dev_mean), str(dev_std), str(scaled_dev_mean)))
    logger.info('  test_y mean: %s, stdev: %s, test_y mean after scaling: %s' %
                (str(test_mean), str(test_std), str(scaled_test_mean)))

    if embedding_path:
        emb_dict, emb_dim, _ = utils.load_word_embedding_dict(
            embedding, embedding_path, vocab, logger, emb_dim)
        emb_matrix = utils.build_embedding_table(vocab, emb_dict, emb_dim,
                                                 logger, caseless=True)
    else:
        emb_matrix = None

    return (X_train, Y_train, mask_train, train_context, train_text), \
        (X_dev, Y_dev, mask_dev, dev_context, dev_text), \
        (X_test, Y_test, mask_test, test_context, test_text), \
        vocab, len(vocab), emb_matrix, overall_maxlen, overall_maxnum, \
        scaled_train_mean, context_len, context_num
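
# Hypothetical usage of the context-aware variant above; the file paths, GloVe
# location, and score column are placeholders, not taken from the repository.
if __name__ == '__main__':
    (train_data, dev_data, test_data, vocab, vocab_size, emb_matrix,
     overall_maxlen, overall_maxnum, scaled_train_mean,
     context_len, context_num) = prepare_sentence_data(
        ['data/train.tsv', 'data/dev.tsv', 'data/test.tsv'],
        embedding_path='embeddings/glove.6B.100d.txt', embedding='glove',
        emb_dim=100, prompt_id=1, score_index=6, need_context=True)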