Example #1
def get_postag_data(config,
                    train_path,
                    dev_path,
                    vocab_path=None,
                    label_path=None):
    use_se_marker = config.use_se_marker
    raw_train_sents = get_sentences(train_path, use_se_marker)
    raw_dev_sents = get_sentences(dev_path, use_se_marker)
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])

    # Prepare word dictionary.
    word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path is not None:
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
        word_dict.accept_new = False
        print('Loaded {} words. Dictionary frozen.'.format(word_dict.size()))

    # Prepare label dictionary.
    label_dict = Dictionary()
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))

    train_sents = [(string_sequence_to_ids(sent[0], word_dict, True,
                                           word_to_embeddings),
                    string_sequence_to_ids(sent[1], label_dict))
                   for sent in raw_train_sents]
    dev_sents = [(string_sequence_to_ids(sent[0], word_dict, True,
                                         word_to_embeddings),
                  string_sequence_to_ids(sent[1], label_dict))
                 for sent in raw_dev_sents]

    print("Extracted {} words and {} tags".format(word_dict.size(),
                                                  label_dict.size()))
    print("Max training sentence length: {}".format(
        max([len(s[0]) for s in train_sents])))
    print("Max development sentence length: {}".format(
        max([len(s[0]) for s in dev_sents])))
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict, [word_embedding],
            [word_embedding_shape])
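
# The loaders in these examples rely on a Dictionary helper whose source is not
# shown. The following is only a minimal sketch of the interface they appear to
# assume (add/add_all/size, an accept_new freeze flag, set_unknown_token, and an
# idx2str list); the behaviour is inferred from the call sites, not taken from
# the original repository.
class Dictionary(object):
    def __init__(self, padding_token=None, unknown_token=None):
        self.str2idx = {}
        self.idx2str = []
        self.accept_new = True  # when False, unseen strings map to the unknown token
        self.unknown_token = None
        if padding_token is not None:
            self.add(padding_token)
        if unknown_token is not None:
            self.set_unknown_token(unknown_token)

    def set_unknown_token(self, token):
        self.unknown_token = token
        self.add(token)

    def add(self, s):
        # Return the id of s, adding it only while the dictionary is not frozen.
        if s not in self.str2idx:
            if not self.accept_new:
                return self.str2idx.get(self.unknown_token)
            self.str2idx[s] = len(self.idx2str)
            self.idx2str.append(s)
        return self.str2idx[s]

    def add_all(self, strings):
        for s in strings:
            self.add(s)

    def size(self):
        return len(self.idx2str)
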
Example #2
def get_srl_data(config,
                 train_data_path,
                 dev_data_path,
                 vocab_path=None,
                 label_path=None):
    '''Load and preprocess SRL train/dev data: build word and label
    dictionaries, convert tokens and labels to ids, and extract features.'''
    use_se_marker = config.use_se_marker
    raw_train_sents = get_srl_sentences(train_data_path, use_se_marker)
    raw_dev_sents = get_srl_sentences(dev_data_path, use_se_marker)
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])  # get pre-trained embeddings

    # Prepare word dictionary.
    word_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path is not None:
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
        word_dict.accept_new = False
        print('Loaded {} words. Dictionary frozen.'.format(word_dict.size()))

    # Prepare label dictionary.
    label_dict = Dictionary()
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))

    # Get tokens and labels: [sentence_id, word, predicate, label]
    train_sentences_ids = [sent[0] for sent in raw_train_sents]
    train_tokens = [
        string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
        for sent in raw_train_sents
    ]
    train_labels = [
        string_sequence_to_ids(sent[3], label_dict) for sent in raw_train_sents
    ]

    if label_dict.accept_new:
        # The training corpus is assumed to contain the 'O' label.
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False

    dev_sentences_ids = [sent[0] for sent in raw_dev_sents]
    dev_tokens = [
        string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
        for sent in raw_dev_sents
    ]
    dev_labels = [
        string_sequence_to_ids(sent[3], label_dict) for sent in raw_dev_sents
    ]
    print('Total tokens in the dev dataset: {}'.format(
        sum([len(sent[1]) for sent in raw_dev_sents])))
    # Get features.
    print('Extracting features')
    train_features, feature_shapes = features.get_srl_features(
        raw_train_sents, config)
    dev_features, feature_shapes2 = features.get_srl_features(
        raw_dev_sents, config)
    for f1, f2 in zip(feature_shapes, feature_shapes2):
        assert f1 == f2

    # For additional features. Unused now.
    feature_dicts = []
    for feature in config.features:
        feature_dicts.append(None)

    train_sents = []
    dev_sents = []
    for i in range(len(train_tokens)):
        train_sents.append((train_sentences_ids[i], ) + (train_tokens[i], ) +
                           tuple(train_features[i]) + (train_labels[i], ))
    for i in range(len(dev_tokens)):
        dev_sents.append((dev_sentences_ids[i], ) + (dev_tokens[i], ) +
                         tuple(dev_features[i]) + (dev_labels[i], ))

    print("Extraced {} words and {} tags".format(word_dict.size(),
                                                 label_dict.size()))
    print("Max training sentence length: {}".format(
        max([len(s[1]) for s in train_sents])))
    print("Max development sentence length: {}".format(
        max([len(s[1]) for s in dev_sents])))
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict,
            [word_embedding, None, None],
            [word_embedding_shape] + feature_shapes,
            [word_dict] + feature_dicts)
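
# Both examples above convert token and label sequences with
# string_sequence_to_ids, whose source is not included here. A plausible sketch
# is given below under the assumption that the third positional argument
# lowercases tokens and that the optional embedding table is only used to decide
# whether an out-of-vocabulary word should fall back to the unknown token; these
# semantics are guesses from the call sites, not the original implementation.
def string_sequence_to_ids(sequence, dictionary, lowercase=False, pretrained=None):
    ids = []
    for s in sequence:
        if lowercase:
            s = s.lower()
        # Hypothetical policy: words without a pretrained embedding are mapped
        # to the unknown token instead of growing the dictionary.
        if pretrained is not None and s not in pretrained:
            s = dictionary.unknown_token
        ids.append(dictionary.add(s))
    return ids
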
def get_srl_data(config,
                 train_data_path,
                 dep_path,
                 dev_data_path,
                 vocab_path=None,
                 char_path=None,
                 label_path=None):
    # Load sentences from the training and development data paths.
    raw_train_sents = get_srl_sentences(train_data_path)
    raw_dev_sents = get_srl_sentences(dev_data_path)
    # Load dev data
    eval_data = load_eval_data(dev_data_path)
    # Load pretrained embeddings
    word_embeddings = get_pretrained_embeddings(
        config.word_embedding)  # get pre-trained embeddings
    head_embeddings = get_pretrained_embeddings(config.head_embedding)

    # Prepare word embedding dictionary.
    word_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    # Prepare head embedding dictionary.
    head_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    # Prepare char dictionary.
    char_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    with open(char_path, 'r') as f_char:
        for line in f_char:
            char_dict.add(line.strip())
    char_dict.accept_new = False
    print('Loaded {} chars. Dictionary frozen.'.format(char_dict.size()))
    # Prepare SRL label dictionary.
    label_dict = Dictionary()
    # The training corpus is assumed to contain the 'O' label.
    label_dict.set_unknown_token(NULL_LABEL)
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(NULL_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))
    # Prepare dependency label dictionary.
    dep_label_dict = Dictionary()

    # Training data: Get tokens and labels: [sentence_id, word, predicate, label]
    train_samples = tokenize_data(raw_train_sents, word_dict, head_dict,
                                  char_dict, label_dict, False,
                                  word_embeddings, head_embeddings)
    # Data for dep Trees
    with Timer("Loading Dependency Trees"):
        dep_trees = SyntacticCONLL()
        dep_trees.read_from_file(dep_path, prune_ratio=config.dep_prune_ratio)
        dep_trees.tokenize_dep_trees(word_dict, char_dict, dep_label_dict,
                                     word_embeddings)

    # Freeze the char and label dictionaries.
    char_dict.accept_new, label_dict.accept_new, dep_label_dict.accept_new = False, False, False
    # Development data:
    dev_samples = tokenize_data(raw_dev_sents, word_dict, head_dict, char_dict,
                                label_dict, False, word_embeddings,
                                head_embeddings)

    # Freeze the word and head dictionaries.
    word_dict.accept_new, head_dict.accept_new = False, False

    print("Extract {} words and {} tags".format(word_dict.size(),
                                                label_dict.size()))
    print("Max training sentence length: {}".format(
        max([s[1] for s in train_samples])))
    print("Max development sentence length: {}".format(
        max([s[1] for s in dev_samples])))

    word_embedding = np.asarray(
        [word_embeddings[w] for w in word_dict.idx2str])
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    head_embedding = np.asarray(
        [head_embeddings[w] for w in head_dict.idx2str])
    head_embedding_shape = [len(head_embedding), len(head_embedding[0])]
    print("word embedding shape {}, head embedding shape {}".format(
        word_embedding_shape, head_embedding_shape))
    return (train_samples, dev_samples, dep_trees.sample_dep_data, eval_data,
            word_dict, head_dict, char_dict, label_dict, dep_label_dict,
            [word_embedding, head_embedding],
            [word_embedding_shape, head_embedding_shape])
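

# A hedged usage sketch for the last loader. The config fields and file paths
# below are placeholders inferred from the attributes the function reads
# (config.word_embedding, config.head_embedding, config.dep_prune_ratio); they
# are not taken from the original repository, and the unpacking order simply
# mirrors the return statement above.
import argparse

if __name__ == '__main__':
    config = argparse.Namespace(word_embedding='emb/words.100d.txt',  # placeholder paths
                                head_embedding='emb/heads.100d.txt',
                                dep_prune_ratio=1.0)
    (train_samples, dev_samples, dep_data, eval_data,
     word_dict, head_dict, char_dict, label_dict, dep_label_dict,
     embeddings, embedding_shapes) = get_srl_data(
         config,
         'data/train.srl', 'data/train.dep', 'data/dev.srl',
         char_path='data/chars.txt',
         label_path='data/labels.txt')
    print('Loaded {} training and {} development samples'.format(
        len(train_samples), len(dev_samples)))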