def get_postag_data(config, train_path, dev_path, vocab_path=None, label_path=None):
    """Load and tokenize POS-tagging data.

    Reads train/dev sentences, builds word and label dictionaries (optionally
    pre-loaded from fixed files and frozen), converts every sentence into
    (word-id sequence, label-id sequence) pairs, and gathers the pretrained
    word-embedding matrix for the final vocabulary.

    Args:
        config: experiment configuration; reads ``use_se_marker`` and
            ``word_embedding``.
        train_path: path to the raw training sentences.
        dev_path: path to the raw development sentences.
        vocab_path: optional path to a fixed vocabulary (one word per line);
            when given, the word dictionary is frozen after loading it.
        label_path: optional path to a fixed label set (one label per line);
            when given, the label dictionary is frozen after loading it.

    Returns:
        (train_sents, dev_sents, word_dict, label_dict,
         [word_embedding], [word_embedding_shape])
    """
    use_se_marker = config.use_se_marker
    raw_train_sents = get_sentences(train_path, use_se_marker)
    raw_dev_sents = get_sentences(dev_path, use_se_marker)
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])

    # Prepare word dictionary.
    word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path is not None:
        # The `with` statement closes the file; no explicit close() needed.
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
        word_dict.accept_new = False
        print('Load {} words. Dictionary freezed.'.format(word_dict.size()))

    # Prepare label dictionary.
    label_dict = Dictionary()
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Load {} labels. Dictionary freezed.'.format(label_dict.size()))

    # Each raw sentence is (words, labels); map both to id sequences.
    train_sents = [(string_sequence_to_ids(sent[0], word_dict, True, word_to_embeddings),
                    string_sequence_to_ids(sent[1], label_dict))
                   for sent in raw_train_sents]
    dev_sents = [(string_sequence_to_ids(sent[0], word_dict, True, word_to_embeddings),
                  string_sequence_to_ids(sent[1], label_dict))
                 for sent in raw_dev_sents]

    print("Extracted {} words and {} tags".format(word_dict.size(),
                                                  label_dict.size()))
    print("Max training sentence length: {}".format(
        max(len(s[0]) for s in train_sents)))
    print("Max development sentence length: {}".format(
        max(len(s[0]) for s in dev_sents)))

    # Embedding matrix row order follows the dictionary's id order.
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict,
            [word_embedding], [word_embedding_shape])
def get_srl_data(config, train_data_path, dev_data_path, vocab_path=None, label_path=None):
    """Load and tokenize SRL data.

    Reads train/dev SRL sentences of the form
    [sentence_id, words, predicate, labels], builds word and label
    dictionaries (optionally pre-loaded from fixed files and frozen),
    extracts the configured SRL features, and packs everything into
    per-sentence tuples:
    (sentence_id, token_ids, feature_1, ..., feature_k, label_ids).

    Args:
        config: experiment configuration; reads ``use_se_marker``,
            ``word_embedding`` and ``features``.
        train_data_path: path to the raw SRL training data.
        dev_data_path: path to the raw SRL development data.
        vocab_path: optional fixed vocabulary file (one word per line);
            when given, the word dictionary is frozen after loading it.
        label_path: optional fixed label file (one label per line);
            when given, the label dictionary is frozen after loading it.

    Returns:
        (train_sents, dev_sents, word_dict, label_dict,
         [word_embedding, None, None],
         [word_embedding_shape] + feature_shapes,
         [word_dict] + feature_dicts)
    """
    use_se_marker = config.use_se_marker
    raw_train_sents = get_srl_sentences(train_data_path, use_se_marker)
    raw_dev_sents = get_srl_sentences(dev_data_path, use_se_marker)
    # Get pre-trained embeddings.
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])

    # Prepare word dictionary.
    word_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path is not None:
        # The `with` statement closes the file; no explicit close() needed.
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
        word_dict.accept_new = False
        print('Load {} words. Dictionary freezed.'.format(word_dict.size()))

    # Prepare label dictionary.
    label_dict = Dictionary()
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Load {} labels. Dictionary freezed.'.format(label_dict.size()))

    # Get tokens and labels: each raw sentence is
    # [sentence_id, words, predicate, labels].
    train_sentences_ids = [sent[0] for sent in raw_train_sents]
    train_tokens = [
        string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
        for sent in raw_train_sents
    ]
    train_labels = [
        string_sequence_to_ids(sent[3], label_dict) for sent in raw_train_sents
    ]
    if label_dict.accept_new:
        # No fixed label file was given: freeze the dictionary built from the
        # training data. NOTE(review): assumes the train corpus contains every
        # label (including 'O') -- confirm.
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False

    dev_sentences_ids = [sent[0] for sent in raw_dev_sents]
    dev_tokens = [
        string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
        for sent in raw_dev_sents
    ]
    dev_labels = [
        string_sequence_to_ids(sent[3], label_dict) for sent in raw_dev_sents
    ]
    print('Total tokens in Dev dataset {}'.format(
        sum(len(sent[1]) for sent in raw_dev_sents)))

    # Get features.
    print('Extracting features')
    train_features, feature_shapes = features.get_srl_features(
        raw_train_sents, config)
    dev_features, feature_shapes2 = features.get_srl_features(
        raw_dev_sents, config)
    # Train and dev must agree on feature shapes.
    for f1, f2 in zip(feature_shapes, feature_shapes2):
        assert f1 == f2

    # For additional features. Unused now (one placeholder per feature).
    feature_dicts = [None for _ in config.features]

    # Pack each sentence as
    # (sentence_id, token_ids, feature_1, ..., feature_k, label_ids).
    train_sents = [
        (sid,) + (tok,) + tuple(feat) + (lab,)
        for sid, tok, feat, lab in zip(train_sentences_ids, train_tokens,
                                       train_features, train_labels)
    ]
    dev_sents = [
        (sid,) + (tok,) + tuple(feat) + (lab,)
        for sid, tok, feat, lab in zip(dev_sentences_ids, dev_tokens,
                                       dev_features, dev_labels)
    ]

    print("Extracted {} words and {} tags".format(word_dict.size(),
                                                  label_dict.size()))
    print("Max training sentence length: {}".format(
        max(len(s[1]) for s in train_sents)))
    print("Max development sentence length: {}".format(
        max(len(s[1]) for s in dev_sents)))

    # Embedding matrix row order follows the dictionary's id order.
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict,
            [word_embedding, None, None],
            [word_embedding_shape] + feature_shapes,
            [word_dict] + feature_dicts)
def get_srl_data(config, train_data_path, dep_path, dev_data_path, vocab_path=None, char_path=None, label_path=None):
    """Load and tokenize SRL data together with dependency trees.

    Builds word/head/char/label dictionaries, tokenizes the train/dev SRL
    sentences via ``tokenize_data``, reads and tokenizes dependency trees
    from ``dep_path``, and assembles the pretrained word and head embedding
    matrices.

    NOTE(review): this redefines ``get_srl_data`` with a different signature;
    if the earlier definition in this module is still needed, one of the two
    should be renamed.

    Args:
        config: experiment configuration; reads ``word_embedding``,
            ``head_embedding`` and ``dep_prune_ratio``.
        train_data_path: path to the raw SRL training data.
        dep_path: path to the dependency-tree (CoNLL) file.
        dev_data_path: path to the raw SRL development data; also used to
            build ``eval_data``.
        vocab_path: unused here despite being accepted -- kept for signature
            compatibility.
        char_path: path to the character vocabulary (one char per line).
            NOTE(review): defaults to None but is effectively required;
            ``open(None)`` raises TypeError -- confirm callers always pass it.
        label_path: optional fixed SRL label file; when given, the label
            dictionary is frozen after loading it.

    Returns:
        (train_samples, dev_samples, dep_trees.sample_dep_data, eval_data,
         word_dict, head_dict, char_dict, label_dict, dep_label_dict,
         [word_embedding, head_embedding],
         [word_embedding_shape, head_embedding_shape])
    """
    # Load sentences (documents) from data paths respectively.
    raw_train_sents = get_srl_sentences(train_data_path)
    raw_dev_sents = get_srl_sentences(dev_data_path)
    # Load dev data used for evaluation.
    eval_data = load_eval_data(dev_data_path)

    # Load pretrained word and head embeddings.
    word_embeddings = get_pretrained_embeddings(config.word_embedding)
    head_embeddings = get_pretrained_embeddings(config.head_embedding)

    # Prepare word embedding dictionary.
    word_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    # Prepare head embedding dictionary.
    head_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    # Prepare char dictionary; frozen immediately after loading the file.
    char_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    # The `with` statement closes the file; no explicit close() needed.
    with open(char_path, 'r') as f_char:
        for line in f_char:
            char_dict.add(line.strip())
    char_dict.accept_new = False
    print('Load {} chars, Dictionary freezed.'.format(char_dict.size()))

    # Prepare SRL label dictionary.
    label_dict = Dictionary()
    label_dict.set_unknown_token(
        NULL_LABEL)  # train corpus contains the label 'O' ?
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(NULL_LABEL)
        label_dict.accept_new = False
        print('Load {} labels. Dictionary freezed.'.format(label_dict.size()))

    # Prepare dependency label dictionary.
    dep_label_dict = Dictionary()

    # Training data: get tokens and labels:
    # [sentence_id, word, predicate, label].
    train_samples = tokenize_data(raw_train_sents, word_dict, head_dict,
                                  char_dict, label_dict, False,
                                  word_embeddings, head_embeddings)

    # Data for dependency trees.
    with Timer("Loading Dependency Trees"):
        dep_trees = SyntacticCONLL()
        dep_trees.read_from_file(dep_path, prune_ratio=config.dep_prune_ratio)
        dep_trees.tokenize_dep_trees(word_dict, char_dict, dep_label_dict,
                                     word_embeddings)

    # Freeze the dictionaries that must not grow past this point.
    char_dict.accept_new, label_dict.accept_new, dep_label_dict.accept_new = \
        False, False, False

    # Development data.
    dev_samples = tokenize_data(raw_dev_sents, word_dict, head_dict,
                                char_dict, label_dict, False,
                                word_embeddings, head_embeddings)
    # Freeze the word and head dictionaries.
    word_dict.accept_new, head_dict.accept_new = False, False

    print("Extract {} words and {} tags".format(word_dict.size(),
                                                label_dict.size()))
    # NOTE(review): unlike the sibling loaders these take sample[1] directly
    # (no len()); presumably tokenize_data stores the sentence length at
    # index 1 -- confirm against its implementation.
    print("Max training sentence length: {}".format(
        max(s[1] for s in train_samples)))
    print("Max development sentence length: {}".format(
        max(s[1] for s in dev_samples)))

    # Embedding matrix row order follows each dictionary's id order.
    word_embedding = np.asarray(
        [word_embeddings[w] for w in word_dict.idx2str])
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    head_embedding = np.asarray(
        [head_embeddings[w] for w in head_dict.idx2str])
    head_embedding_shape = [len(head_embedding), len(head_embedding[0])]
    print("word embedding shape {}, head embedding shape {}".format(
        word_embedding_shape, head_embedding_shape))
    return (train_samples, dev_samples, dep_trees.sample_dep_data, eval_data,
            word_dict, head_dict, char_dict, label_dict, dep_label_dict,
            [word_embedding, head_embedding],
            [word_embedding_shape, head_embedding_shape])