def prepare_sentence_data(datapath, vocab_path, embedding_path=None, embedding='glove', embedd_dim=100, prompt_id=1, vocab_size=0, tokenize_text=True, to_lower=True, sort_by_len=False):

    assert len(datapath) == 1, "data path should only include the train path"
    (train_x, train_y, train_prompts), vocab, overal_maxlen, overal_maxnum = reader.get_data(datapath, vocab_path, prompt_id, vocab_size, tokenize_text=tokenize_text, to_lower=to_lower, sort_by_len=sort_by_len)

    X_train, y_train, mask_train = utils.padding_sentence_sequences(train_x, train_y, overal_maxnum, overal_maxlen, post_padding=True)

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    
    # Convert scores to the [0, 1] range for training and evaluation (loss calculation)
    # Note: when prompt_id is -1, the per-essay prompt ids (train_prompts) are used
    # for scaling instead of a single prompt_id.
    if prompt_id == -1:
        Y_train = reader.get_model_friendly_scores(y_train, train_prompts)
        scaled_train_mean = Y_train.mean()
    else:
        Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
        scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)
    # print Y_train.shape

    logger.info('Statistics:')

    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
                (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = utils.load_word_embedding_dict(embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = utils.build_embedd_table(vocab, embedd_dict, embedd_dim, logger, caseless=True)
    else:
        embedd_matrix = None

    return (X_train, Y_train, mask_train), vocab, len(vocab), embedd_matrix, overal_maxlen, overal_maxnum, scaled_train_mean
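
# The scaling above relies on reader.get_model_friendly_scores, which is not shown
# here. A minimal sketch of what it presumably does for a single prompt (min-max
# scaling of raw scores into [0, 1]) follows; the _ASSUMED_SCORE_RANGE table and the
# helper name are illustrative assumptions, not part of the original reader module.
import numpy as np

_ASSUMED_SCORE_RANGE = {1: (2, 12), 2: (1, 6)}  # hypothetical {prompt_id: (min_score, max_score)}

def _scale_scores_sketch(scores, prompt_id):
    low, high = _ASSUMED_SCORE_RANGE[prompt_id]
    return (np.asarray(scores, dtype='float32') - low) / (high - low)

# e.g. _scale_scores_sketch([2, 7, 12], 1) -> array([0. , 0.5, 1. ], dtype=float32)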
Example 2
logger.info("Padding Training set ...")
char_index_train_pad = dp.construct_padded_char(
    char_index_train,
    char_alphabet,
    max_sent_length=max_length,
    max_char_per_word=max_char_per_word)
logger.info("Padding Dev set ...")
char_index_dev_pad = dp.construct_padded_char(
    char_index_dev,
    char_alphabet,
    max_sent_length=max_length,
    max_char_per_word=max_char_per_word)

#logger.info("Generating data with fine tuning...")
embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
    embedding, embedding_path, logger)
logger.info("Dimension of embedding is %d, Caseless: %d" %
            (embedd_dim, caseless))
# Create an embedding table: if a word from the train/dev vocabulary is in GloVe,
# use its GloVe vector; otherwise assign random values (a sketch of this logic
# follows this snippet).
embedd_table = dp.build_embedd_table(word_alphabet, embedd_dict, embedd_dim,
                                     caseless)
word_vocab = word_alphabet.instances
word_vocab_size = len(word_vocab)
char_vocab = char_alphabet.instances
char_vocab_size = len(char_vocab)
num_classes = len(
    label_alphabet.instances) + 1  # +1 to account for the zero index, which is not used
#logger.info("length of the embedding table is  %d" , embedd_table.shape[0])

# Store the parameters for loading the test set
Flags_Dict['sequence_length'] = max_length
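
# A minimal sketch of the lookup-or-random initialisation described above; the real
# dp.build_embedd_table may differ, and the uniform-initialisation bounds and the
# reserved padding row used here are assumptions.
import numpy as np

def _sketch_build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless):
    scale = np.sqrt(3.0 / embedd_dim)
    table = np.empty((len(word_alphabet.instances) + 1, embedd_dim), dtype='float32')
    table[0, :] = 0.0  # row 0 reserved for padding
    for index, word in enumerate(word_alphabet.instances, start=1):
        key = word.lower() if caseless else word
        if key in embedd_dict:
            table[index, :] = embedd_dict[key]  # pre-trained GloVe vector
        else:
            table[index, :] = np.random.uniform(-scale, scale, embedd_dim)  # random init
    return table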
Example 3
    config["word_length"] = max_char_per_word
    logger.info("set Maximum character length to %d" % max_char_per_word)
    logger.info("Padding Training set ...")
    char_index_train_pad = de.construct_padded_char(
        char_index_train,
        char_alphabet,
        max_sent_length=max_length,
        max_char_per_word=max_char_per_word)
    logger.info("Padding Dev set ...")
    char_index_dev_pad = de.construct_padded_char(
        char_index_dev,
        char_alphabet,
        max_sent_length=max_length,
        max_char_per_word=max_char_per_word)

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
        "glove", FLAGS.embedding_path, logger)
    logger.info("Dimension of embedding is %d, Caseless: %d" %
                (embedd_dim, caseless))
    embedd_table = de.build_embedd_table(word_alphabet, embedd_dict,
                                         embedd_dim, caseless)
    char_embedd_table = de.build_char_embedd_table(char_alphabet,
                                                   config["char_embded_size"])
    logger.info("build embedding complete")
    ner = RNN_CNNs(config, embedd_table, char_embedd_table)
    logger.info("Model Created")
    f1_s = open("f1.txt", "w")
    dev_prediction = ner.transform(word_index_sentences_dev_pad,
                                   char_index_dev_pad, dev_seq_length)
    f1 = _f1(config, dev_prediction, label_index_sentences_dev_pad,
             dev_sent_len, "micro")
    print("\nEvaluate:\n")
Example 4
def prepare_sentence_data(datapaths, embedding_path=None, embedding='word2vec', embedd_dim=100, prompt_id=1, vocab_size=0, tokenize_text=True, \
                         to_lower=True, sort_by_len=False, vocab_path=None, score_index=6):

    assert len(
        datapaths) == 4, "data paths should include train, dev, test and description path"
    (train_x, train_y, train_prompts, train_ids), (dev_x, dev_y, dev_prompts, dev_ids), (test_x, test_y, test_prompts, test_ids), vocab, overal_maxlen, overal_maxnum = \
        reader.get_data(datapaths, prompt_id, vocab_size, tokenize_text=tokenize_text, to_lower=to_lower, sort_by_len=sort_by_len, vocab_path=vocab_path, score_index=score_index)

    train_d, max_sentnum = reader.read_description(datapaths[3],
                                                   vocab,
                                                   len(train_x),
                                                   tokenize_text=True,
                                                   to_lower=True)
    dev_d, max_sentnum = reader.read_description(datapaths[3],
                                                 vocab,
                                                 len(dev_x),
                                                 tokenize_text=True,
                                                 to_lower=True)
    test_d, max_sentnum = reader.read_description(datapaths[3],
                                                  vocab,
                                                  len(test_x),
                                                  tokenize_text=True,
                                                  to_lower=True)

    X_train, y_train, mask_train = utils.padding_sentence_sequences(
        train_x, train_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_dev, y_dev, mask_dev = utils.padding_sentence_sequences(
        dev_x, dev_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_test, y_test, mask_test = utils.padding_sentence_sequences(
        test_x, test_y, overal_maxnum, overal_maxlen, post_padding=True)

    D_train, mask_d_train = utils.padding_des_sequences(train_d,
                                                        max_sentnum,
                                                        overal_maxlen,
                                                        post_padding=True)
    D_dev, mask_d_dev = utils.padding_des_sequences(dev_d,
                                                    max_sentnum,
                                                    overal_maxlen,
                                                    post_padding=True)
    D_test, mask_d_test = utils.padding_des_sequences(test_d,
                                                      max_sentnum,
                                                      overal_maxlen,
                                                      post_padding=True)

    if prompt_id:
        train_pmt = np.array(train_prompts, dtype='int32')
        dev_pmt = np.array(dev_prompts, dtype='int32')
        test_pmt = np.array(test_prompts, dtype='int32')

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    dev_mean = y_dev.mean(axis=0)
    dev_std = y_dev.std(axis=0)
    test_mean = y_test.mean(axis=0)
    test_std = y_test.std(axis=0)

    # We need the dev and test sets in the original scale for evaluation
    # dev_y_org = y_dev.astype(reader.get_ref_dtype())
    # test_y_org = y_test.astype(reader.get_ref_dtype())

    # Convert scores to the [0, 1] range for training and evaluation (loss calculation)
    Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
    Y_dev = reader.get_model_friendly_scores(y_dev, prompt_id)
    Y_test = reader.get_model_friendly_scores(y_test, prompt_id)
    scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)
    # print Y_train.shape

    logger.info('Statistics:')

    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  dev X shape:   ' + str(X_dev.shape))
    logger.info('  test X shape:  ' + str(X_test.shape))

    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  dev Y shape:   ' + str(Y_dev.shape))
    logger.info('  test Y shape:  ' + str(Y_test.shape))

    logger.info(
        '  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
        (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = utils.load_word_embedding_dict(
            embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = utils.build_embedd_table(vocab,
                                                 embedd_dict,
                                                 embedd_dim,
                                                 logger,
                                                 caseless=True)
    else:
        embedd_matrix = None

    return (X_train, Y_train, D_train, mask_train, train_ids), (X_dev, Y_dev, D_dev, mask_dev, dev_ids), (X_test, Y_test, D_test, mask_test, test_ids), \
            vocab, len(vocab), embedd_matrix, overal_maxlen, overal_maxnum, max_sentnum, scaled_train_mean
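
# A hedged usage sketch for this four-path variant; the file names below are
# hypothetical, and the fourth path is the prompt-description file consumed by
# reader.read_description.
datapaths = ['train.tsv', 'dev.tsv', 'test.tsv', 'prompt_description.txt']
(train_data, dev_data, test_data, vocab, vocab_size, embedd_matrix,
 overal_maxlen, overal_maxnum, max_sentnum, scaled_train_mean) = prepare_sentence_data(
    datapaths,
    embedding_path='glove.6B.100d.txt',  # hypothetical embedding file
    embedding='glove',
    embedd_dim=100,
    prompt_id=1)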
Example 5
def prepare_sentence_data(
        datapaths,
        embedding_path=None,
        embedding='word2vec',
        emb_dim=100,
        prompt_id=1,
        vocab_size=0,
        tokenize_text=True,
        to_lower=True,
        vocab_path=None,
        score_index=6,
        need_context=True
):
    assert len(datapaths) == 3, "data paths should include train, dev and test path"
    (train_x, train_y, train_prompts, train_text), \
    (dev_x, dev_y, dev_prompts, dev_text), \
    (test_x, test_y, test_prompts, test_text), \
    vocab, overall_maxlen, overall_maxnum = \
        reader.get_data(
            datapaths,
            prompt_id,
            vocab_size,
            tokenize_text,
            to_lower,
            vocab_path,
            score_index)

    X_train, y_train, mask_train = utils.padding_sentence_sequences(train_x, train_y, overall_maxnum, overall_maxlen,
                                                                    post_padding=True)
    X_dev, y_dev, mask_dev = utils.padding_sentence_sequences(dev_x, dev_y, overall_maxnum, overall_maxlen,
                                                              post_padding=True)
    X_test, y_test, mask_test = utils.padding_sentence_sequences(test_x, test_y, overall_maxnum, overall_maxlen,
                                                                 post_padding=True)

    if need_context:
        context, context_len, context_num = reader.get_context(prompt_id, vocab, to_lower)
    else:
        # Dummy context
        context = [[0]]
        context_len = 1
        context_num = 1
    train_context = [context] * len(train_x)
    dev_context = [context] * len(dev_x)
    test_context = [context] * len(test_x)

    train_context, _, _ = utils.padding_sentence_sequences(train_context, train_y, context_num, context_len, post_padding=True)
    dev_context, _, _ = utils.padding_sentence_sequences(dev_context, dev_y, context_num, context_len, post_padding=True)
    test_context, _, _ = utils.padding_sentence_sequences(test_context, test_y, context_num, context_len, post_padding=True)

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    dev_mean = y_dev.mean(axis=0)
    dev_std = y_dev.std(axis=0)
    test_mean = y_test.mean(axis=0)
    test_std = y_test.std(axis=0)

    # We need the dev and test sets in the original scale for evaluation
    # dev_y_org = y_dev.astype(reader.get_ref_dtype())
    # test_y_org = y_test.astype(reader.get_ref_dtype())

    # Convert scores to the [0, 1] range for training and evaluation (loss calculation)
    Y_train = utils.get_model_friendly_scores(y_train, prompt_id)
    Y_dev = utils.get_model_friendly_scores(y_dev, prompt_id)
    Y_test = utils.get_model_friendly_scores(y_test, prompt_id)
    scaled_train_mean = utils.get_model_friendly_scores(train_mean, prompt_id)
    scaled_dev_mean = utils.get_model_friendly_scores(dev_mean, prompt_id)
    scaled_test_mean = utils.get_model_friendly_scores(test_mean, prompt_id)
    # print Y_train.shape

    logger.info('Statistics:')

    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  dev X shape:   ' + str(X_dev.shape))
    logger.info('  test X shape:  ' + str(X_test.shape))

    if need_context:
        logger.info('  train context shape: ' + str(train_context.shape))
        logger.info('  dev context shape: ' + str(dev_context.shape))
        logger.info('  test context shape: ' + str(test_context.shape))

    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  dev Y shape:   ' + str(Y_dev.shape))
    logger.info('  test Y shape:  ' + str(Y_test.shape))

    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
                (str(train_mean), str(train_std), str(scaled_train_mean)))
    logger.info('  dev_y mean: %s, stdev: %s, dev_y mean after scaling: %s' %
                (str(dev_mean), str(dev_std), str(scaled_dev_mean)))
    logger.info('  test_y mean: %s, stdev: %s, test_y mean after scaling: %s' %
                (str(test_mean), str(test_std), str(scaled_test_mean)))

    if embedding_path:
        emb_dict, emb_dim, _ = utils.load_word_embedding_dict(embedding, embedding_path, vocab, logger, emb_dim)
        emb_matrix = utils.build_embedding_table(vocab, emb_dict, emb_dim, logger, caseless=True)
    else:
        emb_matrix = None

    return (X_train, Y_train, mask_train, train_context, train_text), \
           (X_dev, Y_dev, mask_dev, dev_context, dev_text), \
           (X_test, Y_test, mask_test, test_context, test_text), \
           vocab, len(vocab), emb_matrix, overall_maxlen, overall_maxnum, scaled_train_mean, context_len, context_num
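
# utils.padding_sentence_sequences is used throughout these examples but not shown.
# A minimal sketch of the assumed behaviour (pad every essay to a fixed number of
# sentences and a fixed sentence length, returning word indices, scores and a mask)
# is below; the exact dtypes and mask convention of the real utils function are
# assumptions, and only post-padding is sketched.
import numpy as np

def _sketch_padding_sentence_sequences(index_sequences, scores, max_sentnum, max_sentlen, post_padding=True):
    num = len(index_sequences)
    X = np.zeros((num, max_sentnum, max_sentlen), dtype='int32')
    mask = np.zeros((num, max_sentnum, max_sentlen), dtype='float32')
    y = np.array(scores, dtype='float32')
    for i, essay in enumerate(index_sequences):
        for j, sent in enumerate(essay[:max_sentnum]):
            length = min(len(sent), max_sentlen)
            X[i, j, :length] = sent[:length]   # word indices, post-padded with zeros
            mask[i, j, :length] = 1.0          # 1.0 where a real token is present
    return X, y, mask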