Example 1
import argparse


def create_dictionary(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input-directory", type=str)
    parser.add_argument("-o", "--output-file", type=str)
    args = parser.parse_args(argv)
    sentences = load_directory_sentences(args.input_directory)
    dictionary = Dictionary()
    for sentence in sentences:
        dictionary.add_all(sentence.split(" "))
    dictionary.save(args.output_file)
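A minimal call-site sketch for this snippet; the script name and the directory/file paths below are placeholders, not part of the original example:

if __name__ == "__main__":
    import sys
    # e.g.  python build_dictionary.py -i data/sentences -o dictionary.txt
    create_dictionary(sys.argv[1:])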
Example 2
def get_postag_data(config,
                    train_path,
                    dev_path,
                    vocab_path=None,
                    label_path=None):
    use_se_marker = config.use_se_marker
    raw_train_sents = get_sentences(train_path, use_se_marker)
    raw_dev_sents = get_sentences(dev_path, use_se_marker)
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])

    # Prepare word dictionary.
    word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path is not None:
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
        word_dict.accept_new = False
        print('Loaded {} words. Dictionary frozen.'.format(word_dict.size()))

    # Prepare label dictionary.
    label_dict = Dictionary()
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))

    train_sents = [(string_sequence_to_ids(sent[0], word_dict, True,
                                           word_to_embeddings),
                    string_sequence_to_ids(sent[1], label_dict))
                   for sent in raw_train_sents]
    dev_sents = [(string_sequence_to_ids(sent[0], word_dict, True,
                                         word_to_embeddings),
                  string_sequence_to_ids(sent[1], label_dict))
                 for sent in raw_dev_sents]

    print("Extracted {} words and {} tags".format(word_dict.size(),
                                                  label_dict.size()))
    print("Max training sentence length: {}".format(
        max([len(s[0]) for s in train_sents])))
    print("Max development sentence length: {}".format(
        max([len(s[0]) for s in dev_sents])))
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict, [word_embedding],
            [word_embedding_shape])
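A hedged sketch of a call site, showing how the returned tuple might be unpacked; the `config` object and the file names below are assumptions, not part of the example:

# Hypothetical caller; `config` and the paths are placeholders.
(train_sents, dev_sents, word_dict, label_dict,
 embeddings, embedding_shapes) = get_postag_data(
     config, 'train.postag.txt', 'dev.postag.txt',
     vocab_path='vocab.txt', label_path='labels.txt')

# Each item pairs a word-id sequence with the tag-id sequence of the same sentence.
word_ids, tag_ids = train_sents[0]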
Example 3
def get_srl_data(config,
                 train_data_path,
                 dev_data_path,
                 vocab_path=None,
                 label_path=None):
    '''Load SRL train/dev data: build word and label dictionaries, convert
    tokens and labels to id sequences, and extract the configured features.
    '''
    use_se_marker = config.use_se_marker
    raw_train_sents = get_srl_sentences(train_data_path, use_se_marker)
    raw_dev_sents = get_srl_sentences(dev_data_path, use_se_marker)
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])  # get pre-trained embeddings

    # Prepare word dictionary.
    word_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path is not None:
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
        word_dict.accept_new = False
        print('Loaded {} words. Dictionary frozen.'.format(word_dict.size()))

    # Prepare label dictionary.
    label_dict = Dictionary()
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))

    # Get tokens and labels: [sentence_id, word, predicate, label]
    train_sentences_ids = [sent[0] for sent in raw_train_sents]
    train_tokens = [
        string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
        for sent in raw_train_sents
    ]
    train_labels = [
        string_sequence_to_ids(sent[3], label_dict) for sent in raw_train_sents
    ]

    if label_dict.accept_new:
        label_dict.set_unknown_token(
            UNKNOWN_LABEL)  # the training corpus may not contain the label 'O'
        label_dict.accept_new = False

    dev_sentences_ids = [sent[0] for sent in raw_dev_sents]
    dev_tokens = [
        string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
        for sent in raw_dev_sents
    ]
    dev_labels = [
        string_sequence_to_ids(sent[3], label_dict) for sent in raw_dev_sents
    ]
    print('Total tokens in Dev dataset: {}'.format(
        sum([len(sent[1]) for sent in raw_dev_sents])))
    # Get features
    print('Extracting features')
    train_features, feature_shapes = features.get_srl_features(
        raw_train_sents, config)
    dev_features, feature_shapes2 = features.get_srl_features(
        raw_dev_sents, config)
    for f1, f2 in zip(feature_shapes, feature_shapes2):
        assert f1 == f2

    # For additional features. Unused now.
    feature_dicts = []
    for feature in config.features:
        feature_dicts.append(None)

    train_sents = []
    dev_sents = []
    for i in range(len(train_tokens)):
        train_sents.append((train_sentences_ids[i], ) + (train_tokens[i], ) +
                           tuple(train_features[i]) + (train_labels[i], ))
    for i in range(len(dev_tokens)):
        dev_sents.append((dev_sentences_ids[i], ) + (dev_tokens[i], ) +
                         tuple(dev_features[i]) + (dev_labels[i], ))

    print("Extraced {} words and {} tags".format(word_dict.size(),
                                                 label_dict.size()))
    print("Max training sentence length: {}".format(
        max([len(s[1]) for s in train_sents])))
    print("Max development sentence length: {}".format(
        max([len(s[1]) for s in dev_sents])))
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict,
            [word_embedding, None,
             None], [word_embedding_shape] + feature_shapes, [
                 word_dict,
             ] + feature_dicts)
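A similar hedged sketch of a call site for the SRL variant; `config` and the file names are again placeholders. It only illustrates the shape of the return value and sentence tuples built above:

# Hypothetical caller; `config` and the paths are placeholders.
(train_sents, dev_sents, word_dict, label_dict,
 embeddings, shapes, dicts) = get_srl_data(
     config, 'train.srl.txt', 'dev.srl.txt',
     vocab_path='vocab.txt', label_path='labels.txt')

# Each sentence tuple is (sentence_id, token_ids, *feature_columns, label_ids),
# matching the tuples assembled in the loops above.
first = train_sents[0]
sentence_id, token_ids, label_ids = first[0], first[1], first[-1]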