def create_dictionary(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input-directory", type=str)
    parser.add_argument("-o", "--output-file", type=str)
    args = parser.parse_args(argv)
    sentences = load_directory_sentences(args.input_directory)
    dictionary = Dictionary()
    for sentence in sentences:
        dictionary.add_all(sentence.split(" "))
    dictionary.save(args.output_file)
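
# A minimal usage sketch for create_dictionary, assuming whitespace-tokenized
# sentence files under the input directory; both paths below are hypothetical,
# and Dictionary/load_directory_sentences come from this codebase.
#
#     create_dictionary(["--input-directory", "data/raw_sentences",
#                        "--output-file", "data/vocab.txt"])
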
def get_postag_data(config, train_path, dev_path, vocab_path=None, label_path=None):
    use_se_marker = config.use_se_marker
    raw_train_sents = get_sentences(train_path, use_se_marker)
    raw_dev_sents = get_sentences(dev_path, use_se_marker)
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])

    # Prepare the word dictionary.
    word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path is not None:
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
        word_dict.accept_new = False
        print('Loaded {} words. Dictionary frozen.'.format(word_dict.size()))

    # Prepare the label dictionary.
    label_dict = Dictionary()
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))

    train_sents = [(string_sequence_to_ids(sent[0], word_dict, True, word_to_embeddings),
                    string_sequence_to_ids(sent[1], label_dict))
                   for sent in raw_train_sents]
    dev_sents = [(string_sequence_to_ids(sent[0], word_dict, True, word_to_embeddings),
                  string_sequence_to_ids(sent[1], label_dict))
                 for sent in raw_dev_sents]

    print("Extracted {} words and {} tags".format(word_dict.size(), label_dict.size()))
    print("Max training sentence length: {}".format(
        max([len(s[0]) for s in train_sents])))
    print("Max development sentence length: {}".format(
        max([len(s[0]) for s in dev_sents])))

    # Build the initial embedding matrix in dictionary index order.
    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict,
            [word_embedding], [word_embedding_shape])
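
# A hedged usage sketch for get_postag_data. The config only needs the two
# fields read above (use_se_marker, word_embedding); the PostagConfig name,
# the embedding key, and all file paths here are hypothetical.
#
#     config = PostagConfig(use_se_marker=True, word_embedding='glove100')
#     train_sents, dev_sents, word_dict, label_dict, embeddings, shapes = \
#         get_postag_data(config, 'data/pos.train', 'data/pos.dev',
#                         vocab_path='data/vocab.txt',
#                         label_path='data/pos.labels')
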
def get_srl_data(config, train_data_path, dev_data_path, vocab_path=None, label_path=None):
    '''Load SRL training and development data, build the word and label
    dictionaries, extract features, and assemble the per-sentence tuples
    used by the trainer.'''
    use_se_marker = config.use_se_marker
    raw_train_sents = get_srl_sentences(train_data_path, use_se_marker)
    raw_dev_sents = get_srl_sentences(dev_data_path, use_se_marker)
    # Load the pre-trained embeddings.
    word_to_embeddings = get_pretrained_embeddings(
        WORD_EMBEDDINGS[config.word_embedding])

    # Prepare the word dictionary.
    word_dict = Dictionary(padding_token=PADDING_TOKEN,
                           unknown_token=UNKNOWN_TOKEN)
    if use_se_marker:
        word_dict.add_all([START_MARKER, END_MARKER])
    if vocab_path is not None:
        with open(vocab_path, 'r') as f_vocab:
            for line in f_vocab:
                word_dict.add(line.strip())
        word_dict.accept_new = False
        print('Loaded {} words. Dictionary frozen.'.format(word_dict.size()))

    # Prepare the label dictionary.
    label_dict = Dictionary()
    if label_path is not None:
        with open(label_path, 'r') as f_labels:
            for line in f_labels:
                label_dict.add(line.strip())
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False
        print('Loaded {} labels. Dictionary frozen.'.format(label_dict.size()))

    # Get tokens and labels; each raw sentence is
    # [sentence_id, words, predicate, labels].
    train_sentences_ids = [sent[0] for sent in raw_train_sents]
    train_tokens = [
        string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
        for sent in raw_train_sents
    ]
    train_labels = [
        string_sequence_to_ids(sent[3], label_dict) for sent in raw_train_sents
    ]
    if label_dict.accept_new:
        # No label file was given: freeze the dictionary after collecting the
        # training labels, mapping unseen dev labels to UNKNOWN_LABEL.
        label_dict.set_unknown_token(UNKNOWN_LABEL)
        label_dict.accept_new = False

    dev_sentences_ids = [sent[0] for sent in raw_dev_sents]
    dev_tokens = [
        string_sequence_to_ids(sent[1], word_dict, True, word_to_embeddings)
        for sent in raw_dev_sents
    ]
    dev_labels = [
        string_sequence_to_ids(sent[3], label_dict) for sent in raw_dev_sents
    ]
    print('Total tokens in the dev dataset: {}'.format(
        sum([len(sent[1]) for sent in raw_dev_sents])))

    # Extract features; the train and dev feature shapes must agree.
    print('Extracting features')
    train_features, feature_shapes = features.get_srl_features(
        raw_train_sents, config)
    dev_features, feature_shapes2 = features.get_srl_features(
        raw_dev_sents, config)
    for f1, f2 in zip(feature_shapes, feature_shapes2):
        assert f1 == f2

    # Placeholders for additional feature dictionaries. Unused for now.
    feature_dicts = [None for _ in config.features]

    train_sents = []
    dev_sents = []
    for i in range(len(train_tokens)):
        train_sents.append((train_sentences_ids[i],) + (train_tokens[i],) +
                           tuple(train_features[i]) + (train_labels[i],))
    for i in range(len(dev_tokens)):
        dev_sents.append((dev_sentences_ids[i],) + (dev_tokens[i],) +
                         tuple(dev_features[i]) + (dev_labels[i],))

    print("Extracted {} words and {} tags".format(word_dict.size(),
                                                  label_dict.size()))
    print("Max training sentence length: {}".format(
        max([len(s[1]) for s in train_sents])))
    print("Max development sentence length: {}".format(
        max([len(s[1]) for s in dev_sents])))

    word_embedding = [word_to_embeddings[w] for w in word_dict.idx2str]
    word_embedding_shape = [len(word_embedding), len(word_embedding[0])]
    return (train_sents, dev_sents, word_dict, label_dict,
            [word_embedding, None, None],
            [word_embedding_shape] + feature_shapes,
            [word_dict] + feature_dicts)
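
# A hedged usage sketch for get_srl_data. Beyond use_se_marker and
# word_embedding, the config must also provide the feature list consumed by
# features.get_srl_features; the SRLConfig name, the feature name, and the
# paths are hypothetical. Note the extra return values relative to
# get_postag_data: per-feature shapes and dictionaries.
#
#     config = SRLConfig(use_se_marker=True, word_embedding='glove100',
#                        features=['predicate'])
#     (train_sents, dev_sents, word_dict, label_dict,
#      embeddings, shapes, dicts) = get_srl_data(
#          config, 'data/srl.train', 'data/srl.dev')
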