def build_data(config):
    """Build vocabulary files and trimmed embedding files from train/dev data.

    Only the train and dev splits are read; the test split is not loaded.
    The word vocabulary is the *union* of the corpus words and the words of
    the embedding file, and the POS vocabulary is taken from
    ``get_pos_glove_vocab`` rather than from the corpus.

    Args:
        config: object providing the input/output filenames and the
            embedding dimensions read below.

    Returns:
        None. Results are handed to ``write_vocab`` and the
        ``export_trimmed_*`` helpers together with target filenames taken
        from ``config``.
    """
    word_processor = get_processing_word(lowercase=True)

    # Datasets that apply the word processor to every token.
    dev_set = CoNLLDataset(config.dev_filename, word_processor)
    train_set = CoNLLDataset(config.train_filename, word_processor)

    # The third value returned by get_vocabs is discarded: the POS
    # vocabulary written below comes from get_pos_glove_vocab instead.
    corpus_words, tag_vocab, _ = get_vocabs([train_set, dev_set])
    glove_words = get_glove_vocab(config.glove_filename)
    uni_words = get_glove_vocab(config.glove_uni_filename)
    pos_vocab = get_pos_glove_vocab(config.glove_filename)

    # Union, not intersection: every embedding word is kept.
    word_vocab = glove_words | corpus_words
    for special in (UNK, NUM):
        word_vocab.add(special)
    for special in (UNK, NUM):
        pos_vocab.add(special)

    # Save the vocabularies.
    write_vocab(word_vocab, config.words_filename)
    write_vocab(uni_words, config.uni_words_filename)
    write_vocab(tag_vocab, config.tags_filename)
    write_vocab(pos_vocab, config.pos_filename)

    # Trim the word vectors against the vocabulary that was just saved.
    saved_words = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(saved_words, config.glove_filename,
                                 config.trimmed_filename, config.t_dim)

    # The saved "uni" vocabulary drives two exports with different sources.
    saved_uni = load_vocab(config.uni_words_filename)
    export_trimmed_uni_vectors(saved_uni, config.NEdic_filename,
                               config.trimmed_dic, config.dic_dim)
    export_trimmed_uni_vectors(saved_uni, config.glove_uni_filename,
                               config.uni_trimmed_filename, config.dim)

    saved_pos = load_vocab(config.pos_filename)
    export_trimmed_pos_vectors(saved_pos, config.glove_feature,
                               config.feature_trimmed_filename,
                               config.pos_dim)

    # Character vocabulary: re-read the training data without a word
    # processor.
    raw_train = CoNLLDataset(config.train_filename)
    write_vocab(get_char_vocab(raw_train), config.chars_filename)
def build_data(config):
    """Build word, tag, POS and character vocabularies plus trimmed vectors.

    Args:
        config: object providing the dataset filenames, the embedding
            filename, the output filenames and ``dim``.

    Returns:
        None. Results are handed to ``write_vocab`` and
        ``export_trimmed_glove_vectors`` with filenames from ``config``.
    """
    word_processor = get_processing_word()

    dev_set = CoNLLDataset(config.dev_filename, word_processor)
    test_set = CoNLLDataset(config.test_filename, word_processor)
    train_set = CoNLLDataset(config.train_filename, word_processor)

    corpus_words, tag_vocab, pos_vocab = get_vocabs(
        [train_set, dev_set, test_set])
    glove_words = get_glove_vocab(config.glove_filename)

    # Keep only corpus words that also appear in the embedding file.
    word_vocab = corpus_words & glove_words
    for special in (UNK, NUM):
        word_vocab.add(special)

    write_vocab(word_vocab, config.words_filename)
    write_vocab(tag_vocab, config.tags_filename)
    write_vocab(pos_vocab, config.poss_filename)

    # Trim the vectors against the vocabulary that was just saved.
    saved_words = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(saved_words, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Character vocabulary comes from the training data read without a
    # word processor.
    raw_train = CoNLLDataset(config.train_filename)
    write_vocab(get_char_vocab(raw_train), config.chars_filename)
def get_conll2005_vocab(config, processing_word, embedding_vocab):
    """Build and save the CoNLL-2005 word, tag and character vocabularies.

    Args:
        config: object providing the ``filename_*`` attributes read below.
        processing_word: word processor passed to the datasets used for the
            word/tag vocabulary.
        embedding_vocab: set of words available in the embeddings; corpus
            words outside it are written to a separate ``.unknown`` file.

    Returns:
        Tuple ``(vocab_words, vocab_chars)``.
    """
    print("*** CoNLL-2005 vocabulary ***")

    split_paths = (
        config.filename_train,
        config.filename_dev,
        config.filename_test_wsj,
        config.filename_test_brown,
    )

    # Word and tag vocabulary over all four splits, with word processing.
    processed_sets = [CoNLL2005Dataset(path, processing_word)
                      for path in split_paths]
    all_words, tag_vocab = get_vocabs(processed_sets)

    known_words = all_words & embedding_vocab
    unknown_words = all_words - embedding_vocab
    known_words = _add_vocab_constants(known_words)

    # Character vocabulary over the same splits, read without processing.
    raw_sets = [CoNLL2005Dataset(path) for path in split_paths]
    char_vocab = get_char_vocab(raw_sets)

    # Save vocab
    write_vocab(char_vocab, config.filename_chars)
    write_vocab(known_words, config.filename_words)
    write_vocab(unknown_words, "{}.unknown".format(config.filename_words))
    write_vocab(tag_vocab, config.filename_tags)

    return known_words, char_vocab
def build_data(config):
    """Clean the raw data, then build vocabularies and trimmed vectors.

    Args:
        config: object providing the raw train/test filenames, the dev-split
            options (``build_dev_from_trainset``, ``build_dev_from_testset``,
            ``dev_ratio``), the embedding filename, the output filenames and
            ``dim``.

    Returns:
        None. Results are handed to ``write_vocab`` and
        ``export_trimmed_glove_vectors`` with filenames from ``config``.
    """
    # The original built this processor twice in a row and discarded the
    # first result; once is enough.
    processing_word = get_processing_word(lowercase=True)

    # Clean data. Each call returns the cleaned file path and a dev file
    # path for that source.
    train_filepath, dev_filepath_a = write_clear_data(
        config.train_filename,
        build_dev=config.build_dev_from_trainset,
        dev_ratio=config.dev_ratio)
    test_filepath, dev_filepath_b = write_clear_data(
        config.test_filename,
        build_dev=config.build_dev_from_testset,
        dev_ratio=config.dev_ratio)
    # Prefer the dev split carved from the training set when it is truthy.
    # NOTE(review): if neither call yields a dev path this stays falsy and
    # is passed to Dataset as-is -- confirm that case cannot happen.
    dev_filepath = dev_filepath_a or dev_filepath_b

    # Generators
    dev = Dataset(dev_filepath, processing_word)
    test = Dataset(test_filepath, processing_word)
    train = Dataset(train_filepath, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    # Keep only corpus words that also appear in the embedding file.
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab from the cleaned training data, read
    # without a word processor.
    train = Dataset(train_filepath)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def build_data(config, logger):
    """Build word, tag, character and dependency-relation vocabularies.

    Args:
        config: object providing the dataset filenames, the embedding
            filename, the output filenames, ``lowercase`` and ``dim``.
        logger: logger whose ``info`` method reports progress.

    Returns:
        None. Results are handed to ``write_vocab`` and
        ``export_trimmed_glove_vectors`` with filenames from ``config``.
    """
    word_processor = get_processing_word(lowercase=config.lowercase)
    test_set = CoNLLDataset(config.test_filename, word_processor)
    dev_set = CoNLLDataset(config.dev_filename, word_processor)
    train_set = CoNLLDataset(config.train_filename, word_processor)

    logger.info("Build Word and Tag vocab...")
    # The POS and chunk vocabularies are returned but not used here.
    word_vocab, _pos_vocab, _chunk_vocab, tag_vocab = get_vocabs(
        [train_set, dev_set, test_set])

    # All corpus words are kept (no intersection with the embedding words);
    # the special tokens are added to that same set in place.
    for special in (UNK, NUM):
        word_vocab.add(special)
    write_vocab(word_vocab, config.words_filename)

    # Move "O" to the front of the tag list; remove() raises ValueError if
    # "O" is absent.
    ordered_tags = list(tag_vocab)
    ordered_tags.remove("O")
    ordered_tags.insert(0, "O")
    write_vocab(ordered_tags, config.tags_filename)

    # Trim the vectors against the vocabulary that was just saved.
    saved_words = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(saved_words, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    logger.info("Build chars vocab...")
    raw_train = CoNLLDataset(config.train_filename)
    write_vocab(get_char_vocab(raw_train), config.chars_filename)

    # Dependency relations come from the dev and train dependency files.
    relation_processor = get_processing_relation()
    dev_deps = DepsDataset(config.dev_deps_filename, word_processor,
                           relation_processor)
    train_deps = DepsDataset(config.train_deps_filename, word_processor,
                             relation_processor)
    logger.info("Build relations vocab...")
    relation_vocab = get_relations_vocabs([train_deps, dev_deps])
    relation_vocab.add(UNK)
    write_vocab(relation_vocab, config.relations_filename)
def build_data(config):
    """Build word, tag, character and entity-type vocabularies.

    Args:
        config: object providing the dataset filenames, the embedding
            filename, the output filenames and ``dim``.

    Returns:
        None. Results are handed to ``write_vocab`` and
        ``export_trimmed_glove_vectors`` with filenames from ``config``.
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    # Keep only corpus words that also appear in the embedding file.
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab.add(PAD)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab. Unlike the word pass above this is not
    # re-read raw: the same word processor is applied here too.
    train = CoNLLDataset(config.train_filename, processing_word)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)

    # Build and save type vocab.
    # print(...) with one argument behaves the same on Python 2 and 3; the
    # original `print len(...)` statement is a SyntaxError on Python 3.
    print(len(vocab_tags))
    # Drop the first two characters of every tag other than 'O'.
    # NOTE(review): presumably tags look like "B-PER"/"I-PER" -- confirm.
    vocab_types = {tag[2:] for tag in vocab_tags if tag != 'O'}
    write_vocab(vocab_types, config.types_filename)
def build_data(config):
    """Build word, morphological-tag, tag and character vocabularies.

    Args:
        config: object providing the dataset filenames, the embedding
            filename, the output filenames and ``dim``.

    Returns:
        None. Results are handed to ``write_vocab`` and
        ``export_trimmed_glove_vectors`` with filenames from ``config``.
    """
    word_processor = get_processing_word(lowercase=True)

    dev_set = Data(config.dev_filename, word_processor)
    test_set = Data(config.test_filename, word_processor)
    train_set = Data(config.train_filename, word_processor)

    corpus_words, mor_tag_vocab, tag_vocab = get_vocabs(
        [train_set, dev_set, test_set])
    glove_words = get_glove_vocab(config.glove_filename)

    # Keep only corpus words that also appear in the embedding file.
    word_vocab = corpus_words & glove_words
    for special in (UNK, NUM):
        word_vocab.add(special)
    # Morphological tags get an unknown entry; the plain tags do not.
    mor_tag_vocab.add(UNK)

    write_vocab(word_vocab, config.words_filename)
    write_vocab(mor_tag_vocab, config.mor_tags_filename)
    write_vocab(tag_vocab, config.tags_filename)

    # Trim the vectors against the vocabulary that was just saved.
    saved_words = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(saved_words, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Character vocabulary from the training data, read without a word
    # processor. The original author flagged this line as "should change
    # this".
    raw_train = Data(config.train_filename)
    write_vocab(get_char_vocab(raw_train), config.chars_filename)
def get_semcor_vocab(config, processing_word, embedding_vocab):
    """Build and save the Semcor word, tag and character vocabularies.

    Args:
        config: object providing ``filename_raw`` and the output filenames.
        processing_word: word processor passed to the dataset used for the
            word/tag vocabulary.
        embedding_vocab: set of words available in the embeddings; corpus
            words outside it are written to a separate ``.unknown`` file.

    Returns:
        Tuple ``(vocab_words, vocab_chars)``.
    """
    print("*** Semcor vocabulary ***")

    # Word and tag vocabulary, with word processing applied.
    processed = SemcorDataset(config.filename_raw, processing_word)
    all_words, tag_vocab = get_vocabs([processed])

    known_words = all_words & embedding_vocab
    unknown_words = all_words - embedding_vocab
    known_words = _add_vocab_constants(known_words)

    # Character vocabulary from the same file, read without processing.
    unprocessed = SemcorDataset(config.filename_raw)
    char_vocab = get_char_vocab([unprocessed])

    # Save vocab
    write_vocab(char_vocab, config.filename_chars)
    write_vocab(known_words, config.filename_words)
    write_vocab(unknown_words, "{}.unknown".format(config.filename_words))
    write_vocab(tag_vocab, config.filename_tags)

    return known_words, char_vocab
def build_data(config):
    """Build word, tag and character vocabularies plus trimmed vectors.

    Args:
        config: object providing the dataset filenames, the embedding
            filename, the output filenames, ``lowercase`` and ``dim``.

    Returns:
        None. Results are handed to ``write_vocab`` and
        ``export_trimmed_glove_vectors`` with filenames from ``config``.
    """
    word_processor = get_processing_word(lowercase=config.lowercase)

    dev_set = CoNLLDataset(config.dev_filename, word_processor)
    test_set = CoNLLDataset(config.test_filename, word_processor)
    train_set = CoNLLDataset(config.train_filename, word_processor)

    corpus_words, tag_vocab = get_vocabs([train_set, dev_set, test_set])
    glove_words = get_glove_vocab(config.glove_filename)

    # Keep only corpus words that also appear in the embedding file.
    word_vocab = corpus_words & glove_words
    for special in (UNK, NUM):
        word_vocab.add(special)

    write_vocab(word_vocab, config.words_filename)
    write_vocab(tag_vocab, config.tags_filename)

    # Trim the vectors against the vocabulary that was just saved.
    saved_words = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(saved_words, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Character vocabulary from the training data, read without a word
    # processor.
    raw_train = CoNLLDataset(config.train_filename)
    write_vocab(get_char_vocab(raw_train), config.chars_filename)
def main(pretrained_embeddings_file=None,
         filtered_embeddings_file="data/filtered_embeddings.txt"):
    """Build the word, tag and character vocabulary files under ``data/``.

    Args:
        pretrained_embeddings_file: optional embeddings file. When given,
            the word vocabulary is restricted to words found in it and the
            embeddings are filtered down to the saved vocabulary.
        filtered_embeddings_file: output passed to
            ``filter_embeddings_in_vocabulary``; only used when
            ``pretrained_embeddings_file`` is given.

    Returns:
        None.
    """
    words_file = "data/words.txt"
    tags_file = "data/tags.txt"
    chars_file = "data/chars.txt"
    test_file = 'data/eng.testa'
    train_file = 'data/eng.train'

    word_processor = get_processing_word(lowercase=False)
    test_set = CoNLLDataset(test_file, word_processor)
    train_set = CoNLLDataset(train_file, word_processor)

    vocab_words, vocab_tags = get_vocabs([train_set, test_set])

    # Work on a copy so vocab_words itself stays untouched for the
    # character vocabulary further down.
    word_set = set(vocab_words)
    if pretrained_embeddings_file:
        embedding_vocab = get_embedding_vocab(pretrained_embeddings_file)
        word_set &= embedding_vocab
        print('{} overlapping words'.format(len(word_set)))

    for special in (UNK, NUM):
        word_set.add(special)

    word_list = list(word_set)
    # TODO: there's probably no need for these anymore, check and remove, if
    # this is the case
    # Insertion order matters: each insert shifts the entries after it.
    for marker in (PAD, START_TAG, STOP_TAG):
        word_list.insert(TOKEN2IDX[marker], marker)
    print(len(word_list))

    write_vocab(word_list, words_file)
    write_vocab(vocab_tags, tags_file)

    if pretrained_embeddings_file:
        filter_embeddings_in_vocabulary(words_file,
                                        pretrained_embeddings_file,
                                        filtered_embeddings_file)

    # Character vocabulary is derived from the full, unfiltered word set.
    char_vocab = get_char_vocab(vocab_words)
    write_vocab(char_vocab, chars_file)
def get_conll2003_vocab(config, processing_word, embedding_vocab):
    """Build and save the CoNLL-2003 word, tag and character vocabularies.

    Args:
        config: object providing the ``filename_*`` attributes read below.
        processing_word: word processor passed to the datasets used for the
            word/tag vocabulary.
        embedding_vocab: set of words available in the embeddings; corpus
            words outside it are written to a separate ``.unknown`` file.

    Returns:
        Tuple ``(vocab_words, vocab_chars)``.
    """
    print("*** CoNLL-2003 vocabulary ***")

    # Iterators for the word/tag vocabulary. The original ignored the
    # ``processing_word`` argument here, so words were matched against the
    # embedding vocabulary unprocessed -- unlike the sibling
    # get_conll2005_vocab / get_semcor_vocab builders -- and the second set
    # of iterators below was an exact duplicate of this one.
    # NOTE(review): assumes CoNLL2003Dataset takes the processor as its
    # second positional argument like the sibling dataset classes -- confirm.
    dev = CoNLL2003Dataset(config.filename_dev, processing_word)
    test = CoNLL2003Dataset(config.filename_test, processing_word)
    train = CoNLL2003Dataset(config.filename_train, processing_word)

    vocab_all_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_words = vocab_all_words & embedding_vocab
    unk_words = vocab_all_words - embedding_vocab
    vocab_words = _add_vocab_constants(vocab_words)

    # Re-create the iterators for the char vocabulary, with no processing.
    dev = CoNLL2003Dataset(config.filename_dev)
    test = CoNLL2003Dataset(config.filename_test)
    train = CoNLL2003Dataset(config.filename_train)
    vocab_chars = get_char_vocab([train, dev, test])

    # Save vocab
    write_vocab(vocab_chars, config.filename_chars)
    write_vocab(vocab_words, config.filename_words)
    write_vocab(unk_words, "{}.unknown".format(config.filename_words))
    write_vocab(vocab_tags, config.filename_tags)

    return vocab_words, vocab_chars
def build_data(config):
    """Pair annotation and transcript files, split them, and build vocabs.

    Walks ``config.wimp_corpus`` two directory levels deep. For each leaf
    folder and each speaker ('A', 'B') it reads the annotation file line by
    line alongside the matching transcript file under
    ``config.swd_transcripts``, pairing every word with its float
    annotation. The resulting sentences are shuffled, split 80/10/10 into
    train/test/dev text files, and the word, tag and character vocabularies
    plus trimmed vectors are then built from the training file.

    Args:
        config: object providing ``wimp_corpus``, ``swd_transcripts``, the
            train/test/dev output filenames, the embedding filename, the
            vocabulary output filenames and ``dim``.

    Returns:
        None.

    Raises:
        AssertionError: if a paired annotation/transcript line has a
            different number of tokens after the first three fields.
    """
    annotations = []
    meta_filename = 'sw%s%s-ms98-a-trans.text'  # % (file_id, speaker_id)

    for idx in os.listdir(config.wimp_corpus):
        idx_path = os.path.join(config.wimp_corpus, idx)
        if os.path.isfile(idx_path):
            continue
        for file_id in os.listdir(idx_path):
            folder = os.path.join(idx_path, file_id)
            if os.path.isfile(folder):
                continue
            for speaker_id in ('A', 'B'):
                trans_name = meta_filename % (file_id, speaker_id)
                wimp_trans_file = os.path.join(folder, trans_name)
                swd_trans_file = os.path.join(
                    config.swd_transcripts, idx, file_id, trans_name)
                # The original re-derived file_id and the file name by
                # splitting the path on "/", which depends on the path
                # separator; both values are already known here. The
                # speaker is still taken from position 6 of the file name.
                speaker = trans_name[6]
                with open(wimp_trans_file) as w_file_obj, \
                        open(swd_trans_file) as s_file_obj:
                    # zip stops at the shorter file, as before.
                    for line_num, (anns_, wrds_) in enumerate(
                            zip(w_file_obj, s_file_obj)):
                        # The first three space-separated fields of each
                        # line are skipped.
                        anns = anns_.strip().split(' ')[3:]
                        wrds = wrds_.strip().split(' ')[3:]
                        assert len(anns) == len(wrds), \
                            "file mismatch, line %d : %s and %s" % (
                                line_num, swd_trans_file, wimp_trans_file)
                        sentence = []
                        for ann, wrd in zip(anns, wrds):
                            wrd = clean_word(wrd)
                            if wrd != '':
                                sentence.append(
                                    [(file_id, line_num, speaker),
                                     wrd, float(ann)])
                        if sentence:
                            annotations.append(sentence)

    random.shuffle(annotations)

    # 80% for training, 10% test, 10% dev. Slice bounds must be integers:
    # the original used float bounds (0.8 * len(...)), which raises
    # TypeError. Integer arithmetic also avoids float rounding.
    total = len(annotations)
    train_end = total * 8 // 10
    test_end = total * 9 // 10
    d_train = annotations[:train_end]
    d_test = annotations[train_end:test_end]
    d_dev = annotations[test_end:]

    def prep_text_data(D, outfile):
        """Write one "word label" line per token, blank line per sentence."""
        with open(outfile, 'w') as f:
            for sent in D:
                for _, word, label in sent:
                    f.write("%s %f\n" % (word, label))
                f.write("\n")

    prep_text_data(d_train, config.train_filename)
    prep_text_data(d_test, config.test_filename)
    prep_text_data(d_dev, config.dev_filename)

    processing_word = get_processing_word(lowercase=True)

    # Vocabulary is built using training data only, so the dev and test
    # dataset objects the original created but never used are dropped.
    train = AnnotationDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train])
    vocab_glove = get_glove_vocab(config.glove_filename)

    # Keep only corpus words that also appear in the embedding file.
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab from the training data, read without a
    # word processor.
    train = AnnotationDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def build_data(config):
    """Build word, tag, dictionary, syllable, morph, POS and char vocabs.

    Besides the usual word/tag vocabularies, this variant also saves
    dictionary, syllable, morpheme and POS-tag vocabularies and exports a
    vector file for each of them.

    Args:
        config: object providing the dataset filenames, the feature-vector
            filenames, the output filenames and the dimensions read below.

    Returns:
        None. Results are handed to ``write_vocab`` and the ``export_*``
        helpers with filenames from ``config``.
    """
    word_processor = get_processing_word(lowercase=True)

    dev_set = CoNLLDataset(config.dev_filename, word_processor)
    test_set = CoNLLDataset(config.test_filename, word_processor)
    train_set = CoNLLDataset(config.train_filename, word_processor)

    # Corpus vocabularies (words, tags, POS tags) and feature vocabularies.
    corpus_words, tag_vocab, pos_vocab = get_vocabs(
        [train_set, dev_set, test_set])
    glove_words = get_glove_vocab(config.glove_filename)
    dic_vocab = get_dic_vocab(config.dic_filename, 1)
    syl_vocab = get_dic_vocab(config.syl_filename, 1)
    morph_vocab = get_morph_vocab(config.morph_vec_filename)

    # Keep only corpus words that also appear in the embedding file.
    word_vocab = corpus_words & glove_words

    # The special tokens are decoded from UTF-8 before being added.
    # NOTE(review): .decode() on these constants only exists on Python 2
    # byte strings -- confirm the target interpreter.
    for target in (word_vocab, dic_vocab, syl_vocab, morph_vocab):
        target.add(UNK.decode('utf-8'))
        target.add(NUM.decode('utf-8'))
    # The POS vocabulary only gets the unknown token.
    pos_vocab.add(UNK.decode('utf-8'))

    # Save the vocabularies.
    write_vocab(word_vocab, config.words_filename)
    write_vocab(tag_vocab, config.tags_filename)
    write_vocab(dic_vocab, config.word_dic_filename)
    write_vocab(syl_vocab, config.word_syl_filename)
    write_vocab(morph_vocab, config.morphs_filename)
    write_vocab(pos_vocab, config.posTag_filename)

    # Export vectors, each against the vocabulary that was just saved.
    saved_words = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(saved_words, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    saved_dic = load_vocab(config.word_dic_filename)
    export_dic_vectors(saved_dic, config.dic_filename,
                       config.exported_filename, config.dic_dim)

    saved_syl = load_vocab(config.word_syl_filename)
    export_syl_vectors(saved_syl, config.syl_filename,
                       config.exported_sfilename, config.syl_dim)

    saved_morph = load_vocab(config.morphs_filename)
    export_morph_vectors(saved_morph, config.morph_vec_filename,
                         config.exported_mfilename, config.dim_morph)

    saved_pos = load_vocab(config.posTag_filename)
    export_pos_vectors(saved_pos, config.pos_vec_filename,
                       config.exported_pfilename, config.dim_pos)

    # Character vocabulary from the training data, read without a word
    # processor.
    raw_train = CoNLLDataset(config.train_filename)
    write_vocab(get_char_vocab(raw_train), config.chars_filename)
# Script: build the word, tag and character vocabularies and the trimmed
# vectors, using the filenames and settings held by a fresh Config.
config = Config()
processing_word = get_processing_word(lowercase=True)

# Datasets that apply the word processor to every token.
dev = CoNLLDataset(config.filename_dev, processing_word)
test = CoNLLDataset(config.filename_test, processing_word)
train = CoNLLDataset(config.filename_train, processing_word)

# Word and tag vocabularies over all three splits.
vocab_words, vocab_tags = get_vocabs([train, dev, test])
vocab_glove = get_glove_vocab(config.filename_glove)

# Keep only corpus words that also appear in the embedding file, then add
# the special tokens defined on the config.
vocab = vocab_words & vocab_glove
for special in (config.UNK, config.NUM):
    vocab.add(special)

# Save the vocabularies.
write_vocab(vocab, config.filename_words)
write_vocab(vocab_tags, config.filename_tags)

# Trim the vectors against the vocabulary that was just saved.
vocab = load_vocab(config.filename_words)
export_trimmed_glove_vectors(vocab, config.filename_glove,
                             config.filename_trimmed, config.dim_word)

# Character vocabulary from the training data, read without a word
# processor.
train = CoNLLDataset(config.filename_train)
vocab_chars = get_char_vocab(train)
write_vocab(vocab_chars, config.filename_chars)