def trainNetwork(tml_files, gold_files, newsreader_dir):
    '''
    train::trainNetwork()

    Purpose: Train a neural network for classification of temporal relations.
             Assumes events and timexes will be provided at prediction time.

    Each (tml, gold) pair is turned into a note. Notes are cached in
    newsreader_dir as "<basename>.parsed.pickle": a cached note is unpickled,
    otherwise a TimeNote is built from the pair and pickled for next time.

    @param tml_files: List of unlabeled (no timex, etc) timeML documents
    @param gold_files: Fully labeled gold standard timeML documents
    @param newsreader_dir: Directory that holds the pickled note cache
    @return: The model returned by network.train_model()
    '''
    # NOTE(review): a later definition of trainNetwork in this file appears to
    # rebind this name -- confirm which version callers are meant to get.

    from code.learning import network

    global timenote_imported

    print("Called trainNetwork")

    def basename(path):
        # File name with everything from the first ".tml" onwards removed.
        # Raises ValueError when the path contains no ".tml".
        return os.path.basename(path[0:path.index(".tml")])

    # A set gives constant-time cache lookups inside the loop.
    pickled_timeml_notes = set(os.path.basename(l) for l in glob.glob(newsreader_dir + "/*"))

    # Materialise the pairs once so the total is not recomputed per iteration.
    pairs = list(zip(tml_files, gold_files))
    total = len(pairs)

    # Read in notes
    notes = []

    for i, (tml, gold) in enumerate(pairs):

        assert basename(tml) == basename(gold), "mismatch\n\ttml: {}\n\tgold:{}".format(tml, gold)

        print('\n\nprocessing file {}/{} {}'.format(i + 1, total, tml))

        pickle_name = basename(tml) + ".parsed.pickle"
        pickle_path = newsreader_dir + "/" + pickle_name

        if pickle_name in pickled_timeml_notes:
            # NOTE: unpickling executes arbitrary code; only point
            # newsreader_dir at a trusted cache directory.
            with open(pickle_path, "rb") as cache_file:
                tmp_note = cPickle.load(cache_file)
        else:
            # The import binds TimeNote as a *local* name, so it has to run in
            # every call that needs it. Skipping it whenever the global flag
            # was already True raised UnboundLocalError. Repeated imports are
            # served from sys.modules, so only the first one pays the cost.
            from code.notes.TimeNote import TimeNote
            timenote_imported = True

            tmp_note = TimeNote(tml, gold)
            with open(pickle_path, "wb") as cache_file:
                cPickle.dump(tmp_note, cache_file)

        notes.append(tmp_note)

    data = network._get_training_input(notes)
    mod = network.train_model(None, epochs=1, training_input=data)

    return mod
def trainNetwork(tml_files, gold_files, newsreader_dir, two_pass=True):
    '''
    train::trainNetwork()

    Purpose: Train a neural network for classification of temporal relations.
             Assumes events and timexes will be provided at prediction time.

    Each (tml, gold) pair is turned into a note. Notes are cached in
    newsreader_dir as "<basename>.parsed.pickle": a cached note is unpickled,
    otherwise a TimeNote is built from the pair and pickled for next time.

    @param tml_files: List of unlabeled (no timex, etc) timeML documents
    @param gold_files: Fully labeled gold standard timeML documents
    @param newsreader_dir: Directory that holds the pickled note cache
    @param two_pass: If True, train a 2-class detector followed by a 5-class
                     classifier; otherwise train a single 6-class network.
    @return: (detector, classifier) when two_pass is True, else the single
             model returned by network.train_model()
    '''

    global timenote_imported

    print("Called trainNetwork")

    def basename(path):
        # File name with everything from the first ".tml" onwards removed.
        # Raises ValueError when the path contains no ".tml".
        return os.path.basename(path[0:path.index(".tml")])

    # A set gives constant-time cache lookups inside the loop.
    pickled_timeml_notes = set(os.path.basename(l) for l in glob.glob(newsreader_dir + "/*"))

    # Materialise the pairs once so the total is not recomputed per iteration.
    pairs = list(zip(tml_files, gold_files))
    total = len(pairs)

    # Read in notes
    notes = []

    for i, (tml, gold) in enumerate(pairs):

        assert basename(tml) == basename(gold), "mismatch\n\ttml: {}\n\tgold:{}".format(tml, gold)

        print('\n\nprocessing file {}/{} {}'.format(i + 1, total, tml))

        pickle_name = basename(tml) + ".parsed.pickle"
        pickle_path = newsreader_dir + "/" + pickle_name

        if pickle_name in pickled_timeml_notes:
            # NOTE: unpickling executes arbitrary code; only point
            # newsreader_dir at a trusted cache directory.
            with open(pickle_path, "rb") as cache_file:
                tmp_note = cPickle.load(cache_file)
        else:
            # The import binds TimeNote as a *local* name, so it has to run in
            # every call that needs it. Skipping it whenever the global flag
            # was already True raised UnboundLocalError. Repeated imports are
            # served from sys.modules, so only the first one pays the cost.
            from code.notes.TimeNote import TimeNote
            timenote_imported = True

            tmp_note = TimeNote(tml, gold)
            with open(pickle_path, "wb") as cache_file:
                cPickle.dump(tmp_note, cache_file)

        notes.append(tmp_note)

    # Hyper-parameters that are identical for every model trained below.
    shared_params = dict(
        weight_classes=False,
        batch_size=256,
        input_dropout=0.5,
        reg_W=0,
        reg_B=0,
        reg_act=0,
        LSTM_size=64,
        dense_size=100,
        maxpooling=True,
        data_dim=300,
    )

    if two_pass:

        detect_data = network._get_training_input(notes, presence=True, no_none=False)
        classify_data = network._get_training_input(notes, presence=False, no_none=True)

        detector = network.train_model(None, epochs=150, training_input=detect_data,
                                       encoder_dropout=0, decoder_dropout=0,
                                       max_len='auto', nb_classes=2,
                                       **shared_params)

        # use max input length from detector
        max_len = detector.input_shape[2]

        classifier = network.train_model(None, epochs=500, training_input=classify_data,
                                         encoder_dropout=0., decoder_dropout=0.,
                                         max_len=max_len, nb_classes=5,
                                         **shared_params)

        return detector, classifier

    data = network._get_training_input(notes)

    NNet = network.train_model(None, epochs=150, training_input=data,
                               encoder_dropout=0, decoder_dropout=0,
                               max_len='auto', nb_classes=6,
                               **shared_params)

    return NNet