def load_and_process_data(self, train_path, dev_path=None):
    """Load train/dev CoNLL data, build a ModelHelper, and vectorize both sets.

    If *dev_path* is None or does not exist on disk, the last
    ``self.config.dev_seg_size`` sentences are split off the tail of the
    training data to serve as the dev set.

    Returns:
        (helper, train_data, dev_data, train_raw, dev_raw)
    """
    logger.info("Loading training data...")
    train_raw = read_conll(train_path)
    logger.info("Done. Read %d sentences", len(train_raw))

    logger.info("Loading dev data...")
    if dev_path is not None and os.path.exists(dev_path):
        dev_raw = read_conll(dev_path)
        logger.info("Done. Read %d sentences", len(dev_raw))
    else:
        # No usable dev file: carve the tail off the training data instead.
        cut = len(train_raw) - self.config.dev_seg_size
        train_raw, dev_raw = train_raw[:cut], train_raw[cut:]
        logger.info("Divided train data. Read %d sentences of train", len(train_raw))

    helper = ModelHelper.build(train_raw, self.config)
    logger.info("Corpus of train max sentence length is %d", helper.max_length)

    # Turn token/label sequences into integer-id sequences.
    train_data = helper.vectorize(train_raw, self.config)
    dev_data = helper.vectorize(dev_raw, self.config)
    return helper, train_data, dev_data, train_raw, dev_raw
def load_and_preprocess_data(args):
    """Read the train/dev CoNLL files named by *args* and vectorize them.

    Returns:
        (helper, train_data, dev_data, train, dev) where the ``*_data``
        values are the vectorized forms of the raw sentence/label pairs.
    """
    logger.info("Loading training data...")
    train = read_conll(args.data_train)
    logger.info("Done. Read %d sentences", len(train))

    logger.info("Loading dev data...")
    dev = read_conll(args.data_dev)
    logger.info("Done. Read %d sentences", len(dev))

    # The helper owns the token->id mapping built from the training corpus.
    helper = ModelHelper.build(train)

    # Map every sentence to integer ids for the model.
    train_data = helper.vectorize(train)
    dev_data = helper.vectorize(dev)
    return helper, train_data, dev_data, train, dev
def load_and_preprocess_data(args):
    """Load and vectorize the training and dev datasets.

    Returns a 5-tuple: the ModelHelper, vectorized train data, vectorized
    dev data, and the raw train and dev corpora.
    """
    logger.info("Loading training data...")
    raw_train = read_conll(args.data_train)
    logger.info("Done. Read %d sentences", len(raw_train))
    logger.info("Loading dev data...")
    raw_dev = read_conll(args.data_dev)
    logger.info("Done. Read %d sentences", len(raw_dev))

    helper = ModelHelper.build(raw_train)

    # Vectorize both splits with the vocabulary learned from train.
    vectorized = [helper.vectorize(corpus) for corpus in (raw_train, raw_dev)]
    return helper, vectorized[0], vectorized[1], raw_train, raw_dev
def do_evaluate(args):
    """Restore a trained RNNModel and print its predictions for args.data.

    With ``args.verbose`` the model also yields per-token probabilities,
    which are forwarded to ``print_sentence``.
    """
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    input_data = read_conll(args.data)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    with tf.Graph().as_default():
        logger.info("Building model...",)
        build_start = time.time()
        model = RNNModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - build_start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as session:
            session.run(init)
            # Overwrite the fresh variables with the trained checkpoint.
            saver.restore(session, model.config.model_output)
            probs = None  # stays None when not running in verbose mode
            for output in model.output(session, input_data, summarize=args.verbose):
                if args.verbose:
                    sentence, labels, predictions, probs = output
                else:
                    sentence, labels, predictions = output
                predictions = [LBLS[l] for l in predictions]
                print_sentence(args.output, sentence, labels, predictions, probs)
def load_and_preprocess_data(args):
    """Load the CoNLL train/dev corpora and convert them to id sequences.

    Returns:
        (helper, train_data, dev_data, train, dev)
    """
    logger.info("Loading training data...")
    # Each corpus entry is ([words of a sentence], [labels]).
    train = read_conll(args.data_train)
    logger.info("Done. Read %d sentences", len(train))

    logger.info("Loading dev data...")
    dev = read_conll(args.data_dev)
    logger.info("Done. Read %d sentences", len(dev))

    # NOTE: build() also initializes the ModelHelper itself — an odd
    # constructor-via-classmethod pattern inherited from the assignment code.
    helper = ModelHelper.build(train)

    # Vectorized form: [ ( [ [word_id, case_id] ], [label_ids] ) ]
    train_data = helper.vectorize(train)
    dev_data = helper.vectorize(dev)
    return helper, train_data, dev_data, train, dev
def load_and_preprocess_data(args):
    """Load train/dev CoNLL data, build the vocabulary helper, vectorize.

    Data shapes (observed on a tiny sample corpus):
      * ``read_conll`` yields pairs like
        (['EU', 'rejects', 'German', ...], ['ORG', 'O', 'MISC', ...]).
      * ``helper.tok2id`` maps lower-cased tokens and special markers
        ('CASE:aA', '<s>', '</s>', 'UUUNKKK', ...) to integer ids.
      * ``helper.vectorize`` turns each sentence into
        ([[word_id, case_id], ...], [label_id, ...]).

    Returns:
        (helper, train_data, dev_data, train, dev)
    """
    logger.info("Loading training data...")
    train = read_conll(args.data_train)
    logger.info("Done. Read %d sentences", len(train))

    logger.info("Loading dev data...")
    dev = read_conll(args.data_dev)
    logger.info("Done. Read %d sentences", len(dev))

    # Vocabulary is built from the training split only.
    helper = ModelHelper.build(train)

    # Convert every sentence/label pair into integer ids.
    train_data = helper.vectorize(train)
    dev_data = helper.vectorize(dev)
    return helper, train_data, dev_data, train, dev
def load_and_preprocess_data(args):
    """Read and vectorize the train and dev CoNLL corpora.

    Returns:
        (helper, train_data, dev_data, train, dev)
    """
    logger.info("Loading training data...")
    train = read_conll(args.data_train)
    logger.info("Done. Read %d sentences", len(train))

    logger.info("Loading dev data...")
    dev = read_conll(args.data_dev)
    logger.info("Done. Read %d sentences", len(dev))

    # Build the token->id vocabulary from the training corpus.
    helper = ModelHelper.build(train)

    # Vectorize both splits with that vocabulary.
    train_data = helper.vectorize(train)
    dev_data = helper.vectorize(dev)
    return helper, train_data, dev_data, train, dev
def load_data(args, helper=None):
    """Load whichever datasets *args* names and vectorize them.

    Recognized attributes on *args*: ``data_train``, ``data_dev`` and
    ``data``; each one that is present is read with ``read_conll`` and
    vectorized. When training data is present and no *helper* is supplied,
    a new ModelHelper is built from the training corpus.

    Args:
        args: namespace whose optional attributes name the input files.
        helper: an existing ModelHelper to reuse, or None to build one.

    Returns:
        (helper, data) where data is a dict with keys 'train_examples',
        'dev_examples' and 'examples', each mapping to a dict that may
        contain 'tokens' (raw) and 'token_indices' (vectorized).
    """
    train_examples, dev_examples, data_examples = {}, {}, {}
    if hasattr(args, 'data_train'):
        logger.info("Loading training data...")
        train_examples['tokens'] = read_conll(args.data_train)
        logger.info("Done. Read %d sentences", len(train_examples['tokens']))
    if hasattr(args, 'data_dev'):
        logger.info("Loading dev data...")
        dev_examples['tokens'] = read_conll(args.data_dev)
        logger.info("Done. Read %d sentences", len(dev_examples['tokens']))
    if hasattr(args, 'data'):
        logger.info("Loading data...")
        data_examples['tokens'] = read_conll(args.data)
        logger.info("Done. Read %d sentences", len(data_examples['tokens']))

    if hasattr(args, 'data_train') and helper is None:
        helper = ModelHelper.build(train_examples['tokens'])
        # BUG FIX: the original read dev_examples['tokens'] unconditionally
        # here, raising KeyError whenever args had data_train but no
        # data_dev (and ValueError on an empty dev set). Only widen
        # max_length when dev sentences actually exist.
        if dev_examples.get('tokens'):
            helper.max_length = max(
                max(len(sentence) for sentence, _ in dev_examples['tokens']),
                helper.max_length)

    # Vectorize every split that was loaded.
    if hasattr(args, 'data_train'):
        train_examples['token_indices'] = helper.vectorize(
            train_examples['tokens'])
    if hasattr(args, 'data_dev'):
        dev_examples['token_indices'] = helper.vectorize(
            dev_examples['tokens'])
    if hasattr(args, 'data'):
        data_examples['token_indices'] = helper.vectorize(
            data_examples['tokens'])

    data = {
        'train_examples': train_examples,
        'dev_examples': dev_examples,
        'examples': data_examples
    }
    return helper, data
def do_evaluate(args):
    """Build an RNNModel from a saved helper and print its predictions.

    NOTE(review): unlike the TF-session variants of this function, no
    session is created and no checkpoint is restored here — presumably
    this RNNModel variant manages its own state inside ``output``;
    confirm against the model implementation.
    """
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    input_data = read_conll(args.data)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    logger.info("Building model...",)
    build_start = time.time()
    model = RNNModel(helper, config, embeddings)
    logger.info("took %.2f seconds", time.time() - build_start)

    # Emit one line per sentence with its gold labels and predictions.
    for sentence, labels, predictions in model.output(input_data):
        predicted_labels = [LBLS[idx] for idx in predictions]
        print_sentence(args.output, sentence, labels, predicted_labels)
def do_evaluate(args):
    """Restore a trained RNNModel from its checkpoint and print predictions.

    Loads the helper and embeddings referenced by *args*, rebuilds the
    graph, restores the trained weights, then prints each sentence with
    its gold and predicted labels.
    """
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    input_data = read_conll(args.data)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    with tf.Graph().as_default():
        logger.info("Building model...",)
        t0 = time.time()
        model = RNNModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - t0)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as session:
            session.run(init)
            # Replace the freshly-initialized variables with trained ones.
            saver.restore(session, model.config.model_output)
            for sentence, labels, predictions in model.output(session, input_data):
                predicted_labels = [LBLS[idx] for idx in predictions]
                print_sentence(args.output, sentence, labels, predicted_labels)