def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False,
                                         allow_unk=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
    model.interactive_shell(vocab_tags, processing_word)
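# Every script in this file funnels tokens through get_processing_word.
# Below is a minimal sketch of that helper (assumed behavior, modeled on the
# common sequence-tagging utilities; the UNK/NUM placeholder strings are
# assumptions, not taken from the original code):
UNK = "$UNK$"
NUM = "$NUM$"

def get_processing_word(vocab_words=None, vocab_chars=None,
                        lowercase=False, chars=False, allow_unk=True):
    """Return a closure mapping a raw token to ids (optionally with char ids)."""
    def f(word):
        # 0. get char ids of the word, if a char vocab is used
        if vocab_chars is not None and chars:
            char_ids = [vocab_chars[c] for c in word if c in vocab_chars]
        # 1. normalize the word
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM
        # 2. look up the word id
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            elif allow_unk:
                word = vocab_words[UNK]
            else:
                raise KeyError("unknown token not allowed: check the vocab")
        # 3. return (char ids, word id) or just the word id
        if vocab_chars is not None and chars:
            return char_ids, word
        return word
    return f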
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)
def build_data(config):
    processing_word = get_processing_word()

    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    vocab_words, vocab_tags, vocab_poss = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_poss, config.poss_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    # test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_glove_uni = get_glove_vocab(config.glove_uni_filename)
    vocab_feature = get_pos_glove_vocab(config.glove_filename)

    # vocab = vocab_words & vocab_glove
    vocab = vocab_glove | vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    vocab_pos = vocab_feature
    vocab_pos.add(UNK)
    vocab_pos.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_glove_uni, config.uni_words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_pos, config.pos_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.t_dim)

    vocab = load_vocab(config.uni_words_filename)
    export_trimmed_uni_vectors(vocab, config.NEdic_filename,
                               config.trimmed_dic, config.dic_dim)
    export_trimmed_uni_vectors(vocab, config.glove_uni_filename,
                               config.uni_trimmed_filename, config.dim)

    vocab_feature = load_vocab(config.pos_filename)
    export_trimmed_pos_vectors(vocab_feature, config.glove_feature,
                               config.feature_trimmed_filename,
                               config.pos_dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def main(config):
    # load vocabs
    vocab_words, idx2words = load_vocab(config.words_filename)
    vocab_tags, _ = load_vocab(config.tags_filename)
    vocab_chars, _ = load_vocab(config.chars_filename)
    vocab_pos, _ = load_vocab(config.pos_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_pos = get_processing_word(vocab_pos, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
    embeddings_uni = get_trimmed_glove_vectors(config.uni_trimmed_filename)
    pos_embeddings = get_trimmed_glove_vectors(config.feature_trimmed_filename)
    NE_dic = get_trimmed_glove_vectors(config.trimmed_dic)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, processing_pos, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_pos, config.max_iter)

    # build model
    model = NERModel(config, embeddings, embeddings_uni, pos_embeddings,
                     ntags=len(vocab_tags), nchars=len(vocab_chars),
                     vocab_words=idx2words, NE_dic=NE_dic)
    model.build()

    # train, evaluate and interact
    # `state` and `file` are assumed to be defined elsewhere
    # (e.g. at module level from command-line arguments)
    if state == "train":
        model.train(train, dev, vocab_tags)
    elif state == "evaluate":
        model.evaluate(dev, vocab_tags)
    else:  # state == "predict"
        convert(file)
        t2o("data_format/test_convert.txt", "data_format/test.txt")
        test = CoNLLDataset(config.test_filename, processing_word,
                            processing_tag, processing_pos, config.max_iter)
        model.evaluate(test, vocab_tags)
        tagging("data_format/test_convert.txt")
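# Hypothetical wiring for the `state` and `file` globals read by main() above;
# this block is an assumption for illustration (the argument order, Config
# import and defaults are not from the original script):
if __name__ == "__main__":
    import sys
    from config import Config  # assumed location of the Config class
    state = sys.argv[1] if len(sys.argv) > 1 else "train"  # train | evaluate | predict
    file = sys.argv[2] if len(sys.argv) > 2 else None      # raw input for predict mode
    main(Config())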
def build_data(config, logger):
    """
    Procedure to build data
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    test = CoNLLDataset(config.test_filename, processing_word)
    dev = CoNLLDataset(config.dev_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    print("Build Word and Tag vocab...")
    vocab_words, vocab_poss, vocab_chunks, \
        vocab_aspect_tags, vocab_polarity_tags, vocab_joint_tags = \
        get_vocabs([train, dev, test])
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    print("Dealing words vocab...")
    write_vocab(vocab, config.words_filename)
    print("Dealing poss vocab...")
    write_vocab(vocab_poss, config.poss_filename)

    # make sure "NO" is the first chunk tag, so it gets id 0
    vocab_chunks = list(vocab_chunks)
    if "NO" in vocab_chunks:
        vocab_chunks.remove("NO")
        vocab_chunks.insert(0, "NO")
    else:
        logger.error(">>> vocab_chunks (used as mpqa) is missing the 'NO' tag!")
    print("Dealing chunks vocab...")
    write_vocab(vocab_chunks, config.chunks_filename)

    # make sure "O" is the first tag in each tag vocab, so it gets id 0
    vocab_aspect_tags = list(vocab_aspect_tags)
    vocab_aspect_tags.remove("O")
    vocab_aspect_tags.insert(0, "O")
    vocab_polarity_tags = list(vocab_polarity_tags)
    vocab_polarity_tags.remove("O")
    vocab_polarity_tags.insert(0, "O")
    vocab_joint_tags = list(vocab_joint_tags)
    vocab_joint_tags.remove("O")
    vocab_joint_tags.insert(0, "O")
    print("Dealing aspect_tags vocab...")
    write_vocab(vocab_aspect_tags, config.aspect_tags_filename)
    print("Dealing polarity_tags vocab...")
    write_vocab(vocab_polarity_tags, config.polarity_tags_filename)
    print("Dealing joint_tags vocab...")
    write_vocab(vocab_joint_tags, config.joint_tags_filename)

    # Trim domain and general embeddings
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.domain_filename,
                                 config.domain_trimmed_filename,
                                 config.dim_domain)
    export_trimmed_glove_vectors(vocab, config.general_filename,
                                 config.general_trimmed_filename,
                                 config.dim_general)
def build_data(config, logger):
    """
    Procedure to build data
    """
    # Generators
    processing_word = get_processing_word(lowercase=config.lowercase)
    test = CoNLLDataset(config.test_filename, processing_word)
    dev = CoNLLDataset(config.dev_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    logger.info("Build Word and Tag vocab...")
    vocab_words, vocab_poss, vocab_chunks, vocab_tags = get_vocabs(
        [train, dev, test])
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    vocab_tags = list(vocab_tags)
    vocab_tags.remove("O")
    vocab_tags.insert(0, "O")
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    logger.info("Build chars vocab...")
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)

    # Build and save Depstree
    processing_relation = get_processing_relation()
    dev_deps = DepsDataset(config.dev_deps_filename, processing_word,
                           processing_relation)
    train_deps = DepsDataset(config.train_deps_filename, processing_word,
                             processing_relation)

    logger.info("Build relations vocab...")
    vocab_relations = get_relations_vocabs([train_deps, dev_deps])
    vocab_relations.add(UNK)
    write_vocab(vocab_relations, config.relations_filename)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = {"LOC": 0, "PER": 1, "ORG": 2, "MISC": 3}

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars), niob=3, ntype=4)
    model.build()

    # train, then time the evaluation
    print(vocab_tags)
    model.train(train, dev, vocab_tags)
    stime = time.time()
    model.evaluate(test, vocab_tags)
    print(time.time() - stime)
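# The hard-coded vocab_iob / vocab_type above assume CoNLL-2003 style tags
# such as "B-PER" or "I-LOC", which factor into an IOB prefix and an entity
# type. A small illustration (split_tag is a hypothetical helper, not part
# of the original code):
def split_tag(tag):
    # "B-PER" -> ("B", "PER"); the outside tag "O" carries no entity type
    if tag == "O":
        return "O", None
    iob, _, ent_type = tag.partition("-")
    return iob, ent_type

assert split_tag("B-PER") == ("B", "PER")
assert split_tag("I-MISC") == ("I", "MISC")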
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab.add(PAD)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename, processing_word)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)

    # Build and save type vocab (strip the "B-"/"I-" prefix from each tag)
    vocab_types = set()
    print(len(vocab_tags))
    for tag in vocab_tags:
        if tag != 'O':
            vocab_types.add(tag[2:])
    write_vocab(vocab_types, config.types_filename)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    # prefix and suffix vocabs
    vocab_pref_suff = load_vocab(config.PS_filename)
    vocab_pref_suff_2 = load_vocab(config.PS_filename_2)
    vocab_pref_suff_4 = load_vocab(config.PS_filename_4)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          vocab_pref_suff, vocab_pref_suff_2,
                                          vocab_pref_suff_4, lowercase=True,
                                          chars=config.chars,
                                          Pref_Suff=config.pref_suff)
    processing_tag = get_processing_word(vocab_tags, lowercase=False,
                                         Geoparser=True)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets: dev, test and train hold the raw words and tags;
    # processing_word / processing_tag map them to word and tag indices,
    # so model.evaluate below calls run_evaluate through run_epoch
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train and evaluate
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
def train(config_path, continue_=False):
    # read hyperparameters
    config_params = json.load(open(config_path))

    # load hyperparams and build vocab.txt, chars.txt, tags.txt and embeddings
    config_data = config(**config_params, load=False)
    data_builder(config_data)

    # reload the config with the data created above
    config_train = config(**config_params, load=True)

    # build model
    model = BILSTM_CRF(config_train)
    model.build()

    if continue_:
        try:
            model_path = config_params["model_path"]
            print("Loading weights from path:", model_path)
            model.restore_session(model_path)
            model.reinitialize_weights("proj")
            print("Restoring weights")
        except Exception as e:
            print("Restoring weights failed, training from scratch")
            print(e)
            input()  # pause so the user can read the error

    # data generators
    train_set = CoNLLDataset(config_train.train, config_train.process_words,
                             config_train.process_tags)
    dev = CoNLLDataset(config_train.test, config_train.process_words,
                       config_train.process_tags)

    # train model
    model.train(train_set, dev)
    print("Training complete!")
    print("Remove the events.tf files from the output directory if you don't "
          "need them. Note that removing them won't affect the predictions "
          "in any way.")
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    dictionary = load_vocab("data/types.txt")
    types_dic = collections.OrderedDict([(v, k) for k, v in dictionary.items()])
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = load_vocab(config.types_filename)
    print(vocab_type)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    # build model
    ntype = len(vocab_type)
    model = POSmodel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars), niob=3, ntype=ntype)
    model.build()

    model.train(train, dev, vocab_type)
    model.evaluate(test, vocab_type)
def main(pretrained_embeddings_file=None,
         filtered_embeddings_file="data/filtered_embeddings.txt"):
    words_file = "data/words.txt"
    tags_file = "data/tags.txt"
    chars_file = "data/chars.txt"
    test_file = 'data/eng.testa'
    train_file = 'data/eng.train'

    processing_word = get_processing_word(lowercase=False)
    test = CoNLLDataset(test_file, processing_word)
    train = CoNLLDataset(train_file, processing_word)

    vocab_words, vocab_tags = get_vocabs([train, test])
    vocab = set(vocab_words)
    if pretrained_embeddings_file:
        embedding_vocab = get_embedding_vocab(pretrained_embeddings_file)
        vocab &= embedding_vocab
        print('{} overlapping words'.format(len(vocab)))

    vocab.add(UNK)
    vocab.add(NUM)
    vocab = list(vocab)
    # TODO: there's probably no need for these anymore; check and remove if so
    vocab.insert(TOKEN2IDX[PAD], PAD)
    vocab.insert(TOKEN2IDX[START_TAG], START_TAG)
    vocab.insert(TOKEN2IDX[STOP_TAG], STOP_TAG)
    print(len(vocab))

    write_vocab(vocab, words_file)
    write_vocab(vocab_tags, tags_file)

    if pretrained_embeddings_file:
        filter_embeddings_in_vocabulary(words_file, pretrained_embeddings_file,
                                        filtered_embeddings_file)

    vocab_chars = get_char_vocab(vocab_words)
    write_vocab(vocab_chars, chars_file)
def main():
    """Procedure to build data

    You MUST RUN this procedure. It iterates over the whole dataset (train,
    dev and test) and extracts the vocabularies in terms of words, tags and
    characters. Having built the vocabularies it writes them to files; writing
    a vocabulary to a file assigns an id (the line number) to each word. It
    then extracts the relevant GloVe vectors and stores them in a numpy array
    such that the i-th entry corresponds to the i-th word in the vocabulary.

    Args:
        config: (instance of Config) has attributes like hyper-params...
    """
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word()

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Build word and tag vocabs (assumed continuation, mirroring the
    # build_data variants above; the config.filename_* attribute names
    # are assumptions)
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)
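# A minimal sketch of the vocabulary/embedding helpers the docstring above
# describes (assumed implementations, consistent with how they are called in
# most of these scripts; some variants return an (word2idx, idx2word) pair
# instead): write_vocab makes the line number the word id, load_vocab inverts
# that, and export_trimmed_glove_vectors stores a matrix whose i-th row is
# the GloVe vector of the word with id i.
import numpy as np

def write_vocab(vocab, filename):
    # one word per line; the 0-based line number becomes the word id
    with open(filename, "w") as f:
        for word in vocab:
            f.write("{}\n".format(word))

def load_vocab(filename):
    # map each word to its line number, mirroring write_vocab
    d = dict()
    with open(filename) as f:
        for idx, word in enumerate(f):
            d[word.strip()] = idx
    return d

def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim):
    # keep only the vectors of in-vocabulary words; row i = word with id i
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename) as f:
        for line in f:
            parts = line.strip().split(' ')
            word = parts[0]
            if word in vocab:
                embeddings[vocab[word]] = np.asarray(parts[1:],
                                                     dtype=np.float32)
    np.savez_compressed(trimmed_filename, embeddings=embeddings)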
from model import NERModel
from config import Config
# load_vocab, get_processing_word, get_trimmed_glove_vectors and CoNLLDataset
# are assumed to live in the project's data_utils module
from data_utils import (CoNLLDataset, load_vocab, get_processing_word,
                        get_trimmed_glove_vectors)

config = Config()

vocab_words = load_vocab(config.words_filename)  # word idx
vocab_tags = load_vocab(config.tags_filename)    # tag idx
vocab_chars = load_vocab(config.chars_filename)  # char idx

processing_word = get_processing_word(vocab_words, vocab_chars)
processing_tag = get_processing_word(vocab_tags)

embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

dev = CoNLLDataset(filename=config.dev_filename,
                   processing_word=processing_word,
                   processing_tag=processing_tag,
                   max_iter=config.max_iter)
test = CoNLLDataset(filename=config.test_filename,
                    processing_word=processing_word,
                    processing_tag=processing_tag,
                    max_iter=config.max_iter)
train = CoNLLDataset(filename=config.train_filename,
                     processing_word=processing_word,
                     processing_tag=processing_tag,
                     max_iter=config.max_iter)

model = NERModel(config=config, embeddings=embeddings,
                 ntags=len(vocab_tags), nchars=len(vocab_chars))
        label_score = self.hidden2tag(rnn_out)
        label_score = self.dropfinal(label_score)
        return label_score


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, CoNLLDataset

    train_file = '/media/data/NER/conll03/conll03/train.bmes'
    dev_file = '/media/data/NER/conll03/conll03/dev.bmes'
    test_file = '/media/data/NER/conll03/conll03/test.bmes'

    vocab = Vocab(cutoff=1, wl_th=None, cl_th=None, w_lower=False,
                  w_norm=False, c_lower=False, c_norm=False)
    vocab.build([train_file, dev_file, test_file])

    word2idx = vocab.wd2idx(vocab_words=vocab.w2i, vocab_chars=vocab.c2i,
                            allow_unk=True, start_end=True)
    tag2idx = vocab.tag2idx(vocab_tags=vocab.l2i, start_end=True)

    train_data = CoNLLDataset(train_file, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=10)

    data = []
    label_ids = []
    for words, labels in train_iters:
        char_ids, word_ids = zip(*words)
        data.append(words)
        word_ids, sequence_lengths = seqPAD.pad_sequences(
            word_ids, pad_tok=0, wthres=1024, cthres=32)
        char_ids, word_lengths = seqPAD.pad_sequences(
            char_ids, pad_tok=0, nlevels=2, wthres=1024, cthres=32)
        label_ids, label_lengths = seqPAD.pad_sequences(
            labels, pad_tok=0, wthres=1024, cthres=32)

        w_tensor = Data2tensor.idx2tensor(word_ids)
        c_tensor = Data2tensor.idx2tensor(char_ids)
        y_tensor = Data2tensor.idx2tensor(label_ids)

        data_tensor = Data2tensor.sort_tensors(label_ids, word_ids,
                                               sequence_lengths, char_ids,
                                               word_lengths,
                                               volatile_flag=False)
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word, Tag and POS vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_dic = get_dic_vocab(config.dic_filename, 1)         # dictionary vectors
    vocab_syl = get_dic_vocab(config.syl_filename, 1)         # syllable vectors
    vocab_morph = get_morph_vocab(config.morph_vec_filename)  # morpheme vectors

    vocab = vocab_words & vocab_glove
    vocab.add(UNK.decode('utf-8'))
    vocab.add(NUM.decode('utf-8'))

    word_dic = vocab_dic
    word_dic.add(UNK.decode('utf-8'))
    word_dic.add(NUM.decode('utf-8'))

    word_syl = vocab_syl
    word_syl.add(UNK.decode('utf-8'))
    word_syl.add(NUM.decode('utf-8'))

    word_morph = vocab_morph
    word_morph.add(UNK.decode('utf-8'))
    word_morph.add(NUM.decode('utf-8'))

    vocab_pos.add(UNK.decode('utf-8'))

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(word_dic, config.word_dic_filename)
    write_vocab(word_syl, config.word_syl_filename)
    write_vocab(word_morph, config.morphs_filename)
    write_vocab(vocab_pos, config.posTag_filename)

    # Trim pretrained vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    word_dic = load_vocab(config.word_dic_filename)
    export_dic_vectors(word_dic, config.dic_filename,
                       config.exported_filename, config.dic_dim)

    word_syl = load_vocab(config.word_syl_filename)
    export_syl_vectors(word_syl, config.syl_filename,
                       config.exported_sfilename, config.syl_dim)

    word_morph = load_vocab(config.morphs_filename)
    export_morph_vectors(word_morph, config.morph_vec_filename,
                         config.exported_mfilename, config.dim_morph)

    vocab_pos = load_vocab(config.posTag_filename)
    export_pos_vectors(vocab_pos, config.pos_vec_filename,
                       config.exported_pfilename, config.dim_pos)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
vocab_relations = load_vocab(config.relations_filename)

# get processing functions
processing_word = get_processing_word(vocab_words, vocab_chars,
                                      lowercase=config.lowercase,
                                      chars=config.chars)
processing_tag = get_processing_word(vocab_tags, lowercase=False)
processing_relation = get_processing_relation(vocab_relations)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag=processing_tag)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag=processing_tag)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag=processing_tag)

# iterate once over every sentence so each dataset records its max lengths
# (list() forces the map, which is lazy in Python 3)
data = [dev, test, train]
_ = list(map(len, chain.from_iterable(data)))

max_sentence_size = max(train.max_words_len, test.max_words_len,
                        dev.max_words_len)
max_word_size = max(train.max_chars_len, test.max_chars_len,
                    dev.max_chars_len)
processing_aspect_tag = get_processing_word(vocab_aspect_tags,
                                            lowercase=False)
processing_polarity_tag = get_processing_word(vocab_polarity_tags,
                                              lowercase=False)
processing_joint_tag = get_processing_word(vocab_joint_tags, lowercase=False)

# get pre-trained embeddings
domain_embeddings = get_trimmed_glove_vectors(config.domain_trimmed_filename)
general_embeddings = get_trimmed_glove_vectors(config.general_trimmed_filename)

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word, processing_pos,
                   processing_chunk, processing_aspect_tag,
                   processing_polarity_tag, processing_joint_tag,
                   config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word, processing_pos,
                    processing_chunk, processing_aspect_tag,
                    processing_polarity_tag, processing_joint_tag,
                    config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word, processing_pos,
                     processing_chunk, processing_aspect_tag,
                     processing_polarity_tag, processing_joint_tag,
                     config.max_iter)

# iterate once over every sentence so each dataset records its max length
# (list() forces the map, which is lazy in Python 3)
data = [dev, test, train]
_no_use_ = list(map(len, chain.from_iterable(data)))

max_sentence_size = max(train.max_sentence_len, test.max_sentence_len,
                        dev.max_sentence_len)
# training data
train_filename = "{}/train.txt".format(data_dir)
valid_filename = "{}/valid.txt".format(data_dir)

# glove files
glove_filename = "{}/glove.6B.{}d.txt".format(glove_dir, dim_word)
# trimmed embeddings (created from glove_filename with build_data.py)
filename_trimmed = "{}/glove.6B.{}d.trimmed.npz".format(output_dir, dim_word)

words_filename = "{}/words.txt".format(output_dir)
tags_filename = "{}/tags.txt".format(output_dir)
chars_filename = "{}/chars.txt".format(output_dir)

processing_word = get_processing_word(lowercase=True)

train = CoNLLDataset(train_filename, processing_word)
valid = CoNLLDataset(valid_filename, processing_word)

# Build word and tag vocabs
vocab_words, vocab_tags = get_vocabs([train, valid])
vocab_glove = get_glove_vocab(glove_filename)

vocab = vocab_words & vocab_glove
vocab.add(UNK)
vocab.add(NUM)

# Save vocab
write_vocab(vocab, words_filename)
write_vocab(vocab_tags, tags_filename)

# Trim GloVe Vectors (assumed continuation, mirroring the build_data
# variants above)
vocab = load_vocab(words_filename)
export_trimmed_glove_vectors(vocab, glove_filename, filename_trimmed,
                             dim_word)
                                      dic_flag=config.dic_flag)
processing_tag = get_processing_word(vocab_tags, lowercase=False)
processing_pos = get_processing_word(pos_tags=pos_tags, posflag=True,
                                     lowercase=True, pos_lm=True)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
dic_embeddings = get_exported_dic_vectors(config.exported_filename)
morph_embeddings = get_exported_morph_vectors(config.exported_mfilename)
syl_embeddings = get_exported_dic_vectors(config.exported_sfilename)
pos_embeddings = get_exported_pos_vectors(config.exported_pfilename)

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag, processing_pos, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, processing_pos, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag, processing_pos, config.max_iter)

# build model
lmwords = len(vocab_words)
lmposs = len(pos_tags)
model = NERModel(config, embeddings, dic_embeddings, pos_embeddings,
                 syl_embeddings, morph_embeddings,
use_chars = True
max_iter = None

print('Loading vocab files and word vectors from {}'.format(data_dir))
vocab_tags = load_vocab("{}/assets/tags.txt".format(data_dir))
vocab_chars = load_vocab("{}/assets/chars.txt".format(data_dir))
vocab_words = load_vocab("{}/assets/words.txt".format(data_dir))

n_words = len(vocab_words)
n_char = len(vocab_chars)
n_tags = len(vocab_tags)
pad_tag = n_tags
n_labels = n_tags + 1

# CoNLL data for training
train = CoNLLDataset(train_filename,
                     get_processing_word(vocab_words, vocab_chars,
                                         lowercase=True, chars=use_chars),
                     get_processing_word(vocab_tags, lowercase=False,
                                         allow_unk=False),
                     max_iter)
# CoNLL data for validation
valid = CoNLLDataset(valid_filename,
                     get_processing_word(vocab_words, vocab_chars,
                                         lowercase=True, chars=use_chars),
                     get_processing_word(vocab_tags, lowercase=False,
                                         allow_unk=False),
                     max_iter)

emb_data = np.load("{}/assets/glove.6B.300d.trimmed.npz".format(data_dir))
embeddings = emb_data["embeddings"]

# Hyperparameters
dim_word = 300
dim_char = 100
hidden_size_char = 100   # lstm on chars
hidden_size_lstm = 300   # lstm on word embeddings
nepochs = args.epochs
lr = 0.0105
lr_decay = 0.0005