def get_srl_test_data(filepath, config, word_dict, label_dict, allow_new_words=True):
    """get the test data from file"""
    word_dict.accept_new = allow_new_words
    if label_dict.accept_new:
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False

    if filepath is not None and filepath != '':
        samples = get_srl_sentences(filepath, config.use_se_marker)
    else:
        samples = []
    word_to_embeddings = get_pretrained_embeddings(WORD_EMBEDDINGS[config.word_embedding])
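    # With allow_new_words, string_sequence_to_ids may grow word_dict with any
    # test word that has a pretrained vector (all other unseen words fall back
    # to the unknown token), which keeps the embedding lookup below safe.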
    if allow_new_words:
        tokens = [string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings) for sent in samples]
    else:
        tokens = [string_sequence_to_ids(sent[1], word_dict, True) for sent in samples]

    test_sentences_ids = [sent[0] for sent in samples]
    labels = [string_sequence_to_ids(sent[3], label_dict) for sent in samples]
    srl_features, feature_shapes = features.get_srl_features(samples, config)

    # Assemble one tuple per sentence: (sentence_id, word_ids, feature_ids..., label_ids).
    sentences = []
    for i in range(len(tokens)):
        sentences.append((test_sentences_ids[i],) + (tokens[i],) + tuple(srl_features[i]) + (labels[i],))

    # Build the embedding matrix in dictionary index order.
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (sentences, [word_embedding, None, None], [word_embedding_shape] + feature_shapes)
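
# Minimal usage sketch for get_srl_test_data above. The names `config`,
# `word_dict`, and `label_dict` are assumed to come from the training run
# that produced the model, and `config.test_data_path` is a hypothetical field.
def _example_load_test_data(config, word_dict, label_dict):
    test_sents, embeddings, shapes = get_srl_test_data(
        config.test_data_path, config, word_dict, label_dict,
        allow_new_words=True)
    # Each element of test_sents is
    # (sentence_id, word_ids, feature_ids..., label_ids), and shapes is
    # [[vocab_size, emb_dim]] + feature_shapes.
    return test_sents, embeddings, shapes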
Example #2
def get_srl_test_data_gemb(filepath, config, word_dict, label_dict, allow_new_words=False):
  allow_new_words = False # should not make use of pretrained embeddings at test time
  word_dict.accept_new = allow_new_words
  if label_dict.accept_new:
    label_dict.set_unknown_token(UNKNOWN_LABEL)
    label_dict.accept_new = False
  
  if filepath is not None and filepath != '':
    samples = get_srl_sentences(filepath, config.use_se_marker)
  else:
    samples = []
  word_to_embeddings = get_pretrained_embeddings(WORD_EMBEDDINGS[config.word_embedding])
  if allow_new_words:
    tokens = [string_sequence_to_ids(sent[0], word_dict, True, word_to_embeddings) for sent in samples]
  else:
    # One list of word ids per sentence; with accept_new=False, unseen words
    # fall back to the dictionary's unknown id.
    tokens = [string_sequence_to_ids(sent[0], word_dict, True) for sent in samples]

  labels = [string_sequence_to_ids(sent[2], label_dict) for sent in samples]
  srl_features, feature_shapes = features.get_srl_features(samples, config)
  
  sentences = []
  for i in range(len(tokens)):
    sentences.append((tokens[i],) + tuple(srl_features[i]) + (labels[i],))
    
  word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
  word_embedding_shape = [len(word_embedding), len(word_embedding[0])]

  # Each sentence tuple is (word_ids, feature_ids..., label_ids); the return
  # value is (sentences, [word_embedding, None, None], [emb_shape] + feature_shapes).
  return (sentences, [word_embedding, None, None], [word_embedding_shape] + feature_shapes)
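
# Sketch contrasting the GEMB loader with the plain one (illustrative only;
# `config.test_data_path` is a hypothetical field). Because allow_new_words is
# forced to False, every out-of-vocabulary test word maps to the unknown id,
# so the model sees exactly the embedding matrix it was trained with.
def _example_load_test_data_gemb(config, word_dict, label_dict):
    test_sents, embeddings, shapes = get_srl_test_data_gemb(
        config.test_data_path, config, word_dict, label_dict)
    # Unlike get_srl_test_data above, each tuple carries no sentence id.
    return test_sents, embeddings, shapes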
Example #3
def get_srl_test_data(filepath, config, word_dict, label_dict, lower_case=True, allow_new_words=True):
  word_dict.accept_new = allow_new_words
  if label_dict.accept_new:
    label_dict.set_unknown_token(UNKNOWN_LABEL)
    label_dict.accept_new = False

  if filepath is not None and filepath != '':
    print("Getting sentences from {}".format(filepath))
    samples = get_srl_sentences(filepath, config.use_se_marker)
  else:
    samples = []

  # Prefer an explicitly configured embedding file over the named default.
  embeddings_file = config.embedding_file if config.embedding_file is not None else WORD_EMBEDDINGS[config.word_embedding]
  print("Reading {}".format(embeddings_file))
  word_to_embeddings = get_pretrained_embeddings(embeddings_file)
  print("Done, got {} embeddings".format(len(word_to_embeddings)))
  if allow_new_words:
    tokens = [string_sequence_to_ids(sent[0], word_dict, True, word_to_embeddings) for sent in samples]
  else:
    tokens = [string_sequence_to_ids(sent[0], word_dict, True) for sent in samples]

  labels = [string_sequence_to_ids(sent[2], label_dict) for sent in samples]
  srl_features, feature_shapes = features.get_srl_features(samples, config)

  sentences = []
  for i in range(len(tokens)):
    sentences.append((tokens[i],) + tuple(srl_features[i]) + (labels[i],))

  # Fall back to the unknown-token embedding for words without a pretrained vector.
  word_embedding = [word_to_embeddings[w] if w in word_to_embeddings else word_to_embeddings[UNKNOWN_TOKEN] for w in word_dict.idx2str]
  word_embedding_shape = [len(word_embedding), len(word_embedding[0])]

  return (sentences, [word_embedding, None, None], [word_embedding_shape] + feature_shapes)
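
# Illustrative call showing the embedding override this variant adds: when
# config.embedding_file is set, it takes precedence over the WORD_EMBEDDINGS
# lookup. The path and the test_data_path field below are placeholders.
def _example_load_with_custom_embeddings(config, word_dict, label_dict):
    config.embedding_file = 'path/to/custom.vec'  # hypothetical path
    return get_srl_test_data(config.test_data_path, config, word_dict,
                             label_dict, lower_case=True, allow_new_words=True)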
    
Example #4
def get_srl_data(config,
                 train_data_path,
                 dev_data_path,
                 vocab_path=None,
                 label_path=None):
    '''Load and preprocess SRL training and development data, building the
    word and label dictionaries and the pretrained embedding matrix.
    '''
    use_se_marker = config.use_se_marker
    raw_train_sents = get_srl_sentences(train_data_path, use_se_marker)
    raw_dev_sents = get_srl_sentences(dev_data_path, use_se_marker)
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])  # get pre-trained embeddings

    # Prepare word dictionary.
    word_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path is not None:
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
        word_dict.accept_new = False
        print('Loaded {} words. Dictionary frozen.'.format(word_dict.size()))

    # Prepare label dictionary.
    label_dict = Dictionary()
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))

    # Get tokens and labels; each raw sentence is [sentence_id, words, predicates, labels].
    train_sentences_ids = [sent[0] for sent in raw_train_sents]
    train_tokens = [
        string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
        for sent in raw_train_sents
    ]
    train_labels = [
        string_sequence_to_ids(sent[3], label_dict) for sent in raw_train_sents
    ]

    if label_dict.accept_new:
        # If no label file was given, freeze the dictionary after reading the
        # training labels and route unseen dev labels to UNKNOWN_LABEL.
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False

    dev_sentences_ids = [sent[0] for sent in raw_dev_sents]
    dev_tokens = [
        string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
        for sent in raw_dev_sents
    ]
    dev_labels = [
        string_sequence_to_ids(sent[3], label_dict) for sent in raw_dev_sents
    ]
    print('Total tokens in dev dataset: {}'.format(
        sum([len(sent[1]) for sent in raw_dev_sents])))
    # Get features
    print('Extracting features')
    train_features, feature_shapes = features.get_srl_features(
        raw_train_sents, config)
    dev_features, feature_shapes2 = features.get_srl_features(
        raw_dev_sents, config)
    # Train and dev must produce identically shaped features.
    for f1, f2 in zip(feature_shapes, feature_shapes2):
        assert f1 == f2

    # Placeholder dictionaries for additional features (currently unused).
    feature_dicts = [None for _ in config.features]

    # Assemble per-sentence tuples: (sentence_id, word_ids, feature_ids..., label_ids).
    train_sents = []
    dev_sents = []
    for i in range(len(train_tokens)):
        train_sents.append((train_sentences_ids[i],) + (train_tokens[i],) +
                           tuple(train_features[i]) + (train_labels[i],))
    for i in range(len(dev_tokens)):
        dev_sents.append((dev_sentences_ids[i],) + (dev_tokens[i],) +
                         tuple(dev_features[i]) + (dev_labels[i],))

    print("Extraced {} words and {} tags".format(word_dict.size(),
                                                 label_dict.size()))
    print("Max training sentence length: {}".format(
        max([len(s[1]) for s in train_sents])))
    print("Max development sentence length: {}".format(
        max([len(s[1]) for s in dev_sents])))
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict,
            [word_embedding, None, None],
            [word_embedding_shape] + feature_shapes,
            [word_dict] + feature_dicts)
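
# Illustrative training-side call (the file paths are placeholders and the
# config object comes from the surrounding project). Passing vocab_path and
# label_path freezes the dictionaries up front; leaving them as None lets the
# training corpus define both.
def _example_prepare_training_data(config):
    (train_sents, dev_sents, word_dict, label_dict,
     embeddings, shapes, dicts) = get_srl_data(
         config, 'data/srl/train.txt', 'data/srl/dev.txt',
         vocab_path=None, label_path=None)
    return train_sents, dev_sents, word_dict, label_dict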