def __init__(self, word_rep_file=None, pickled_rep_reader=None):
    """Attach a word-representation reader and derive the input size.

    Args:
        word_rep_file: Argument handed to ``RepReader`` when no ready-made
            reader is supplied.
        pickled_rep_reader: An already constructed reader; when truthy it
            takes precedence over ``word_rep_file``.

    Raises:
        ValueError: If neither argument is provided.
    """
    if pickled_rep_reader:
        self.rep_reader = pickled_rep_reader
    elif word_rep_file:
        self.rep_reader = RepReader(word_rep_file)
    else:
        # Without this guard the attribute below is never set and the
        # failure surfaces as an opaque AttributeError on the next line.
        raise ValueError(
            "Either word_rep_file or pickled_rep_reader must be provided.")
    # The first entry of the reader's rep_shape is used as the input width.
    self.input_size = self.rep_reader.rep_shape[0]
    self.tagger = None
def __init__(self, word_rep_file, train=False, cv=True, folds=5,
             modeltype="mlp", trained_model_name="trained_model.pkl",
             tagset_file="tagset.pkl"):
    """Configure the statement classifier for training or prediction.

    Args:
        word_rep_file: Argument handed to ``RepReader``.
        train: When true no model is loaded; otherwise a stored model is
            unpickled from ``"<modeltype>_<trained_model_name>"``.
        cv: Whether cross-validation is requested (stored on the instance).
        folds: Number of cross-validation folds (stored on the instance).
        modeltype: ``"mlp"`` selects two hidden layers; any other value
            selects a single hidden size.
        trained_model_name: Suffix of the pickle file name.
        tagset_file: Accepted for interface compatibility; not used here.
    """
    self.trained_model_name = "%s_%s" % (modeltype, trained_model_name)
    self.cv = cv
    self.folds = folds
    self.rep_reader = RepReader(word_rep_file)
    self.input_size = self.rep_reader.rep_shape[0]
    if modeltype == "mlp":
        self.hidden_sizes = [20, 10]
    else:
        self.hidden_size = 20
    self.max_iter = 100
    self.learning_rate = 0.01
    self.tag_index = None
    self.modeltype = modeltype
    if train:
        sys.stderr.write("Statement classifier initialized for training.\n")
        if self.cv:
            sys.stderr.write("Cross-validation will be done\n")
        self.classifier = None
    else:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        # The context manager closes the handle, which was leaked before.
        with open(self.trained_model_name, "rb") as model_file:
            self.classifier = cPickle.load(model_file)
        sys.stderr.write(
            "Stored model loaded. Statement classifier initialized for prediction.\n")
def __init__(self, word_rep_file, train=False, cv=True, folds=5,
             modeltype="mlp", trained_model_name="trained_model.pkl",
             tagset_file="tagset.pkl"):
    """Set up hyper-parameters and, unless training, load a stored model.

    Args:
        word_rep_file: Argument handed to ``RepReader``.
        train: If true the classifier starts untrained (``None``); if false
            it is unpickled from ``"<modeltype>_<trained_model_name>"``.
        cv: Cross-validation flag, kept on the instance.
        folds: Fold count, kept on the instance.
        modeltype: ``"mlp"`` uses ``hidden_sizes``; anything else uses a
            single ``hidden_size``.
        trained_model_name: Suffix of the pickle file name.
        tagset_file: Accepted for interface compatibility; not used here.
    """
    self.trained_model_name = "%s_%s" % (modeltype, trained_model_name)
    self.cv = cv
    self.folds = folds
    self.rep_reader = RepReader(word_rep_file)
    self.input_size = self.rep_reader.rep_shape[0]
    if modeltype == "mlp":
        self.hidden_sizes = [20, 10]
    else:
        self.hidden_size = 20
    self.max_iter = 100
    self.learning_rate = 0.01
    self.tag_index = None
    self.modeltype = modeltype
    if not train:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        # Opening through ``with`` guarantees the handle is closed.
        with open(self.trained_model_name, "rb") as model_file:
            self.classifier = cPickle.load(model_file)
        sys.stderr.write(
            "Stored model loaded. Statement classifier initialized for prediction.\n")
        return
    sys.stderr.write("Statement classifier initialized for training.\n")
    if self.cv:
        sys.stderr.write("Cross-validation will be done\n")
    self.classifier = None
class PassageTagger(object):
    """Labels every clause of a passage with a Keras sequence model.

    Passages are padded at the front: real clauses occupy the last rows of
    each input matrix, and the label sequences follow the same layout.
    """

    def __init__(self, word_rep_file=None, pickled_rep_reader=None):
        """Attach a representation reader.

        A truthy ``pickled_rep_reader`` wins; otherwise ``word_rep_file`` is
        passed to ``RepReader``; with neither, ``RepReader(elastic=True)`` is
        used.
        """
        if pickled_rep_reader:
            self.rep_reader = pickled_rep_reader
        elif word_rep_file:
            self.rep_reader = RepReader(word_rep_file)
        else:
            self.rep_reader = RepReader(elastic=True)
        self.input_size = self.rep_reader.rep_shape[0]
        self.tagger = None

    @staticmethod
    def _tail(seq, count):
        """Return the last ``count`` items of ``seq`` (none when count is 0)."""
        # ``seq[-0:]`` would return everything, so compute the start index.
        return seq[max(0, len(seq) - count):]

    def make_data(self, clauses, use_attention, maxseqlen=None,
                  maxclauselen=None, label_ind=None, train=False):
        """Turn passages into padded input tensors and one-hot label tensors.

        Args:
            clauses: Passed straight to ``read_passages``.
            use_attention: If true each clause keeps one vector per word
                (capped at ``maxclauselen``); otherwise word vectors are
                averaged into one vector per clause.
            maxseqlen: Number of clause slots per passage; defaults to the
                longest passage read.
            maxclauselen: Word slots per clause (attention only); defaults
                to the longest clause read.
            label_ind: Existing label-to-index map to extend; a fresh map
                containing only ``"none"`` is used when falsy.
            train: Whether the passages are labeled.

        Returns:
            ``(seq_lengths, X, Y)`` where ``seq_lengths`` are the passage
            lengths before truncation and ``Y`` is empty unless ``train``.
        """
        sys.stderr.write("Reading data..\n")
        str_seqs, label_seqs = read_passages(clauses, is_labeled=train)
        sys.stderr.write(
            "Sample data for train:\n" if train else "Sample data for test:\n")
        sys.stderr.write("%s\n" % (list(zip(str_seqs[0], label_seqs[0])),))
        self.label_ind = label_ind if label_ind else {"none": 0}
        seq_lengths = [len(seq) for seq in str_seqs]
        if not maxseqlen:
            maxseqlen = max(seq_lengths)
        if not maxclauselen and use_attention:
            maxclauselen = max(len(clause.split())
                               for str_seq in str_seqs for clause in str_seq)
        X = []
        Y = []
        Y_inds = []
        for str_seq, label_seq in zip(str_seqs, label_seqs):
            for label in label_seq:
                if label not in self.label_ind:
                    self.label_ind[label] = len(self.label_ind)
            if use_attention:
                x = numpy.zeros((maxseqlen, maxclauselen, self.input_size))
            else:
                x = numpy.zeros((maxseqlen, self.input_size))
            # Integer dtype: these values are used as array subscripts below.
            y_ind = numpy.zeros(maxseqlen, dtype=int)
            seq_len = len(str_seq)
            # True only when a passage is longer than maxseqlen, e.g. a test
            # passage longer than anything seen in training.
            # NOTE(review): this keeps the FIRST maxseqlen clauses, while
            # predict() reports ignored clauses "at the beginning" - confirm
            # which end is meant to be dropped.
            if seq_len > maxseqlen:
                str_seq = str_seq[:maxseqlen]
                seq_len = maxseqlen
            # With labels, zip stops at the shorter of clauses and labels.
            if train:
                clause_iter = zip(str_seq, label_seq)
            else:
                clause_iter = ((clause, None) for clause in str_seq)
            for i, (clause, label) in enumerate(clause_iter):
                row = i - seq_len  # negative index: fill from the tail
                clause_rep = self.rep_reader.get_clause_rep(clause)
                if use_attention:
                    if len(clause_rep) > maxclauselen:
                        clause_rep = clause_rep[:maxclauselen]
                    x[row][-len(clause_rep):] = clause_rep
                else:
                    x[row] = numpy.mean(clause_rep, axis=0)
                if train:
                    y_ind[row] = self.label_ind[label]
            X.append(x)
            if train:
                Y_inds.append(y_ind)
        # One-hot encoding happens last because label_ind grows while reading.
        slots = numpy.arange(maxseqlen)
        for y_ind in Y_inds:
            y = numpy.zeros((maxseqlen, len(self.label_ind)))
            y[slots, y_ind] = 1
            Y.append(y)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
        return seq_lengths, numpy.asarray(X), numpy.asarray(Y)

    def get_attention_weights(self, X_test):
        """Evaluate the output of the model's attention layer on ``X_test``.

        Raises:
            RuntimeError: If no tagger is set or it has no layer whose
                configured name is ``tensorattention``.
        """
        if not self.tagger:
            raise RuntimeError("Tagger not trained yet!")
        inp = self.tagger.get_input()
        att_out = None
        for layer in self.tagger.layers:
            if layer.get_config()['name'].lower() == "tensorattention":
                att_out = layer.get_output()
                break
        # Identity test: the truth value of a symbolic output is not meaningful.
        if att_out is None:
            raise RuntimeError("No attention layer found!")
        f = theano.function([inp], att_out)
        return f(X_test)

    def predict(self, X, bidirectional, test_seq_lengths=None, tagger=None):
        """Predict a label sequence for every passage in ``X``.

        Args:
            X: Padded input tensor as produced by ``make_data``.
            bidirectional: Selects the graph-style predict call.
            test_seq_lengths: True passage lengths; inferred from the first
                non-zero row of each passage when ``None``.
            tagger: Model to use; defaults to ``self.tagger``.

        Returns:
            ``(pred_probs, pred_label_seqs, x_lens)``.

        Raises:
            RuntimeError: If no tagger is available.
        """
        if not tagger:
            tagger = self.tagger
        if not tagger:
            raise RuntimeError("Tagger not trained yet!")
        if test_seq_lengths is None:
            # Determining actual lengths sans padding
            x_lens = []
            for x in X:
                x_len = 0
                for i, xi in enumerate(x):
                    if xi.sum() != 0:
                        x_len = len(x) - i
                        break
                x_lens.append(x_len)
        else:
            x_lens = test_seq_lengths
        if bidirectional:
            pred_probs = tagger.predict({'input': X})['output']
        else:
            pred_probs = tagger.predict(X)
        pred_inds = numpy.argmax(pred_probs, axis=2)
        pred_label_seqs = []
        for pred_ind, x_len in zip(pred_inds, x_lens):
            pred_label_seq = self._tail(
                [self.rev_label_ind[pred] for pred in pred_ind], x_len)
            # Positive when the passage was longer than the model's sequence
            # length, i.e. some clauses never reached the model.
            num_ignored_clauses = max(0, x_len - len(pred_label_seq))
            if num_ignored_clauses > 0:
                warnings.warn(
                    "Test sequence too long. Ignoring %d clauses at the beginning and labeling them none."
                    % num_ignored_clauses)
                pred_label_seq = ["none"] * num_ignored_clauses + pred_label_seq
            pred_label_seqs.append(pred_label_seq)
        return pred_probs, pred_label_seqs, x_lens

    def fit_model(self, X, Y, use_attention, att_context, bidirectional):
        """Build, compile and fit a tagger on ``X``/``Y`` and return it."""
        sys.stderr.write("Input shape: %s %s\n" % (X.shape, Y.shape))
        early_stopping = EarlyStopping(patience=2)
        num_classes = len(self.label_ind)
        if bidirectional:
            tagger = Graph()
            tagger.add_input(name='input', input_shape=X.shape[1:])
            if use_attention:
                tagger.add_node(
                    TensorAttention(X.shape[1:], context=att_context),
                    name='attention', input='input')
                lstm_input_node = 'attention'
            else:
                lstm_input_node = 'input'
            # Floor division keeps the layer size an int on every Python.
            half_dim = X.shape[-1] // 2
            tagger.add_node(LSTM(half_dim, return_sequences=True),
                            name='forward', input=lstm_input_node)
            tagger.add_node(
                LSTM(half_dim, return_sequences=True, go_backwards=True),
                name='backward', input=lstm_input_node)
            tagger.add_node(
                TimeDistributedDense(num_classes, activation='softmax'),
                name='softmax', inputs=['forward', 'backward'],
                merge_mode='concat', concat_axis=-1)
            tagger.add_output(name='output', input='softmax')
            tagger.summary()
            tagger.compile('adam', {'output': 'categorical_crossentropy'})
            tagger.fit({'input': X, 'output': Y}, validation_split=0.1,
                       callbacks=[early_stopping], nb_epoch=100,
                       batch_size=10)
        else:
            tagger = Sequential()
            word_proj_dim = 50
            if use_attention:
                _, input_len, timesteps, input_dim = X.shape
                tagger.add(HigherOrderTimeDistributedDense(
                    input_dim=input_dim, output_dim=word_proj_dim))
                att_input_shape = (input_len, timesteps, word_proj_dim)
                sys.stderr.write(
                    "Attention input shape: %s\n" % (att_input_shape,))
                tagger.add(Dropout(0.5))
                tagger.add(
                    TensorAttention(att_input_shape, context=att_context))
            else:
                _, input_len, input_dim = X.shape
                tagger.add(TimeDistributedDense(
                    input_dim=input_dim, input_length=input_len,
                    output_dim=word_proj_dim))
            tagger.add(LSTM(input_dim=word_proj_dim, output_dim=word_proj_dim,
                            input_length=input_len, return_sequences=True))
            tagger.add(TimeDistributedDense(num_classes, activation='softmax'))
            tagger.summary()
            tagger.compile(loss='categorical_crossentropy', optimizer='adam')
            tagger.fit(X, Y, validation_split=0.1, callbacks=[early_stopping],
                       show_accuracy=True, batch_size=10)
        return tagger

    def train(self, X, Y, use_attention, att_context, bidirectional, cv=True,
              folds=5):
        """Optionally cross-validate, then fit on all data and save the model.

        The final model's JSON config, weights and label index are written
        to files named after the attention/context/bidirectional settings.
        """
        if cv:
            cv_folds = make_folds(X, Y, folds)
            accuracies = []
            fscores = []
            for fold_num, ((train_fold_X, train_fold_Y),
                           (test_fold_X, test_fold_Y)) in enumerate(cv_folds):
                tagger = self.fit_model(train_fold_X, train_fold_Y,
                                        use_attention, att_context,
                                        bidirectional)
                pred_probs, pred_label_seqs, x_lens = self.predict(
                    test_fold_X, bidirectional, tagger=tagger)
                pred_inds = numpy.argmax(pred_probs, axis=2)
                flattened_preds = []
                flattened_targets = []
                for x_len, pred_ind, test_target in zip(
                        x_lens, pred_inds, test_fold_Y):
                    flattened_preds.extend(self._tail(pred_ind, x_len))
                    flattened_targets.extend(
                        [list(tt).index(1)
                         for tt in self._tail(test_target, x_len)])
                assert len(flattened_preds) == len(flattened_targets)
                accuracy, weighted_fscore, all_fscores = evaluate(
                    flattened_targets, flattened_preds)
                sys.stderr.write(
                    "Finished fold %d. Accuracy: %f, Weighted F-score: %f\n"
                    % (fold_num, accuracy, weighted_fscore))
                sys.stderr.write("Individual f-scores:\n")
                for cat in all_fscores:
                    sys.stderr.write("%s: %f\n" % (self.rev_label_ind[cat],
                                                   all_fscores[cat]))
                accuracies.append(accuracy)
                fscores.append(weighted_fscore)
            accuracies = numpy.asarray(accuracies)
            fscores = numpy.asarray(fscores)
            sys.stderr.write("Accuracies: %s\n" % (accuracies,))
            sys.stderr.write("Average: %0.4f (+/- %0.4f)\n" % (
                accuracies.mean(), accuracies.std() * 2))
            sys.stderr.write("Fscores: %s\n" % (fscores,))
            sys.stderr.write("Average: %0.4f (+/- %0.4f)\n" % (
                fscores.mean(), fscores.std() * 2))
        # NOTE(review): the collapsed source has no ``else`` here, so the
        # final fit is read as running whether or not cv was requested.
        self.tagger = self.fit_model(X, Y, use_attention, att_context,
                                     bidirectional)
        model_ext = "att=%s_cont=%s_bi=%s" % (str(use_attention), att_context,
                                              str(bidirectional))
        model_weights_file_name = "model_%s_weights" % model_ext
        model_label_ind = "model_%s_label_ind.json" % model_ext
        # Context managers make sure both files are flushed and closed.
        with open("model_%s_config.json" % model_ext, "w") as model_config_file:
            model_config_file.write(self.tagger.to_json() + "\n")
        self.tagger.save_weights(model_weights_file_name, overwrite=True)
        with open(model_label_ind, "w") as label_ind_file:
            json.dump(self.label_ind, label_ind_file)
class PassageTagger(object):
    """Labels every clause of a passage with a Keras sequence model.

    Passages are padded at the front: real clauses occupy the last rows of
    each input matrix, and the label sequences follow the same layout.
    """

    # Default corpus locations used by make_data_cached_elmo.
    _DEFAULT_TEXT_PATH = "/nas/home/xiangcil/bio_corpus/Molecular_Interaction_Evidence_Fragment_Corpus/02_expt_spans_complete/pathway_logic/"
    _DEFAULT_CACHE_PATH = "/nas/home/xiangcil/bio_corpus/Elmo_Cached_Molecular_Interaction_Evidence_Fragment_Corpus/pathway_logic/"

    def __init__(self, word_rep_file=None, pickled_rep_reader=None):
        """Attach a representation reader if one is available.

        ``input_size`` falls back to 0 when no reader was supplied or the
        reader exposes no usable ``rep_shape``.
        """
        if pickled_rep_reader:
            self.rep_reader = pickled_rep_reader
        elif word_rep_file:
            self.rep_reader = RepReader(word_rep_file)
        try:
            self.input_size = self.rep_reader.rep_shape[0]
        except (AttributeError, IndexError, TypeError):
            # Best-effort default, narrowed from a bare ``except`` so that
            # unrelated errors (e.g. KeyboardInterrupt) are not swallowed.
            self.input_size = 0
        self.tagger = None

    @staticmethod
    def _tail(seq, count):
        """Return the last ``count`` items of ``seq`` (none when count is 0)."""
        # ``seq[-0:]`` would return everything, so compute the start index.
        return seq[max(0, len(seq) - count):]

    def _one_hot(self, Y_inds, maxseqlen):
        """Expand integer label rows into one-hot matrices."""
        slots = np.arange(maxseqlen)
        Y = []
        for y_ind in Y_inds:
            y = np.zeros((maxseqlen, len(self.label_ind)))
            y[slots, y_ind] = 1
            Y.append(y)
        return Y

    def make_data(self, trainfilename, use_attention, maxseqlen=None,
                  maxclauselen=None, label_ind=None, train=False):
        """Turn passages into padded input tensors and one-hot label tensors.

        Args:
            trainfilename: Passed straight to ``read_passages``.
            use_attention: If true each clause keeps one vector per word
                (capped at ``maxclauselen``); otherwise word vectors are
                averaged into one vector per clause.
            maxseqlen: Clause slots per passage; defaults to the longest
                passage read.
            maxclauselen: Word slots per clause (attention only); defaults
                to the longest clause read.
            label_ind: Existing label-to-index map to extend; a fresh map
                containing only ``"none"`` is used when falsy.
            train: Whether the passages are labeled.

        Returns:
            ``(seq_lengths, X, Y)`` where ``seq_lengths`` are the passage
            lengths before truncation and ``Y`` (one-hot labels) is empty
            unless ``train``.
        """
        str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
        self.label_ind = label_ind if label_ind else {"none": 0}
        seq_lengths = [len(seq) for seq in str_seqs]
        if not maxseqlen:
            maxseqlen = max(seq_lengths)
        if not maxclauselen and use_attention:
            maxclauselen = max(len(clause.split())
                               for str_seq in str_seqs for clause in str_seq)
        X = []
        Y_inds = []
        for str_seq, label_seq in zip(str_seqs, label_seqs):
            for label in label_seq:
                if label not in self.label_ind:
                    # New labels get the next free index: 0, 1, 2, ...
                    self.label_ind[label] = len(self.label_ind)
            if use_attention:
                x = np.zeros((maxseqlen, maxclauselen, self.input_size))
            else:
                x = np.zeros((maxseqlen, self.input_size))
            # Integer dtype: these values are used as array subscripts later.
            y_ind = np.zeros(maxseqlen, dtype=int)
            seq_len = len(str_seq)
            # True only when a passage is longer than maxseqlen, e.g. a test
            # passage longer than anything seen in training.
            # NOTE(review): this keeps the FIRST maxseqlen clauses, while
            # predict() reports ignored clauses "at the beginning" - confirm
            # which end is meant to be dropped.
            if seq_len > maxseqlen:
                str_seq = str_seq[:maxseqlen]
                seq_len = maxseqlen
            # With labels, zip stops at the shorter of clauses and labels.
            if train:
                clause_iter = zip(str_seq, label_seq)
            else:
                clause_iter = ((clause, None) for clause in str_seq)
            for i, (clause, label) in enumerate(clause_iter):
                row = i - seq_len  # negative index: fill from the tail
                clause_rep = self.rep_reader.get_clause_rep(clause)
                if use_attention:
                    if len(clause_rep) > maxclauselen:
                        clause_rep = clause_rep[:maxclauselen]
                    x[row][-len(clause_rep):] = clause_rep
                else:
                    x[row] = np.mean(clause_rep, axis=0)
                if train:
                    y_ind[row] = self.label_ind[label]
            X.append(x)
            if train:
                Y_inds.append(y_ind)
        # One-hot encoding happens last because label_ind grows while reading.
        Y = self._one_hot(Y_inds, maxseqlen)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
        return seq_lengths, np.asarray(X), np.asarray(Y)

    def make_data_cached_elmo(self, use_attention, maxseqlen=300,
                              maxclauselen=30, label_ind=None, train=False,
                              textpath=None, cachepath=None):
        """Load cached embedding arrays and build labels from corpus TSVs.

        Args:
            use_attention: When false the embeddings are averaged over axis 2.
            maxseqlen: Length of each label row.
            maxclauselen: Maximum number of labels collected per paragraph.
            label_ind: Existing label-to-index map to extend.
            train: Whether label rows are produced.
            textpath: Directory prefix (ending in a separator) holding the
                ``*.tsv`` files; defaults to the original corpus location.
            cachepath: Directory prefix (ending in a separator) holding one
                ``.npy`` file per TSV; defaults to the original location.

        Returns:
            ``(seq_lengths, X, Y)``.
        """
        if textpath is None:
            textpath = self._DEFAULT_TEXT_PATH
        if cachepath is None:
            cachepath = self._DEFAULT_CACHE_PATH
        all_texts = glob.glob(textpath + "*.tsv")
        self.label_ind = label_ind if label_ind else {"none": 0}
        label_seqs = []
        label_seq = []
        elmo_layer = 3
        embedding_dim = 1024
        # The empty seed array fixes the result's shape when no file is found.
        # NOTE(review): axis order here is (maxclauselen, maxseqlen), while
        # the labels below are rows of length maxseqlen - confirm intended.
        embeddings = [np.zeros((0, maxclauselen, maxseqlen,
                                embedding_dim * elmo_layer))]
        for filename in all_texts:
            shortfilename = filename.split("/")[-1].split(".")[0]
            embeddings.append(np.load(cachepath + shortfilename + ".npy"))
            df = pd.read_csv(filename, sep='\t', header=0, index_col=0,
                             engine='python')
            df = df[pd.notnull(df["Discourse Type"])]
            num_rec = df.shape[0]
            prev_paragraph = ""
            for i in range(num_rec):
                if df["Paragraph"][i][0] == "p":  # e.g. "p1"
                    if df["Paragraph"][i] != prev_paragraph:
                        # A new paragraph starts: flush the previous one.
                        prev_paragraph = df["Paragraph"][i]
                        if len(label_seq) > 0:
                            label_seqs.append(label_seq)
                            label_seq = []
                        clause_count = 0
                    if clause_count < maxclauselen:
                        label_seq.append(df["Discourse Type"][i])
                        clause_count += 1
            print("Loading pkl file: ", shortfilename)
        # NOTE(review): the labels of the very last paragraph are never
        # appended to label_seqs - confirm whether that is intended.
        # A single concatenate avoids re-copying the array for every file.
        X = np.concatenate(embeddings, axis=0)
        if not use_attention:
            X = np.mean(X, axis=2)
        seq_lengths = [len(seq) for seq in label_seqs]
        Y_inds = []
        for label_seq in label_seqs:
            for label in label_seq:
                if label not in self.label_ind:
                    # New labels get the next free index: 0, 1, 2, ...
                    self.label_ind[label] = len(self.label_ind)
            y_ind = np.zeros(maxseqlen, dtype=int)
            seq_len = len(label_seq)
            if train:
                for i, label in enumerate(label_seq):
                    y_ind[-seq_len + i] = self.label_ind[label]
                Y_inds.append(y_ind)
        Y = self._one_hot(Y_inds, maxseqlen)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
        return seq_lengths, X, np.asarray(Y)

    def get_attention_weights(self, X_test):
        """Evaluate the output of the model's attention layer on ``X_test``.

        Raises:
            RuntimeError: If no tagger is set or it has no layer whose
                configured name is ``tensorattention``.
        """
        if not self.tagger:
            # ``raise (RuntimeError, "...")`` raised a tuple, which is a
            # TypeError on Python 3; raise a real exception instance.
            raise RuntimeError("Tagger not trained yet!")
        inp = self.tagger.get_input()
        att_out = None
        for layer in self.tagger.layers:
            if layer.get_config()['name'].lower() == "tensorattention":
                att_out = layer.get_output()
                break
        # Identity test: the truth value of a symbolic output is not meaningful.
        if att_out is None:
            raise RuntimeError("No attention layer found!")
        f = theano.function([inp], att_out)
        return f(X_test)

    def predict(self, X, bidirectional, test_seq_lengths=None, tagger=None):
        """Predict a label sequence for every passage in ``X``.

        Args:
            X: Padded input tensor as produced by ``make_data``.
            bidirectional: Kept for interface compatibility; both settings
                use the same predict call.
            test_seq_lengths: True passage lengths; inferred from the first
                non-zero row of each passage when ``None``.
            tagger: Model to use; defaults to ``self.tagger``.

        Returns:
            ``(pred_probs, pred_label_seqs, x_lens)``.

        Raises:
            RuntimeError: If no tagger is available.
        """
        if not tagger:
            tagger = self.tagger
        if not tagger:
            raise RuntimeError("Tagger not trained yet!")
        if test_seq_lengths is None:
            # Determining actual lengths sans padding
            x_lens = []
            for x in X:
                x_len = 0
                for i, xi in enumerate(x):
                    if xi.sum() != 0:
                        x_len = len(x) - i
                        break
                x_lens.append(x_len)
        else:
            x_lens = test_seq_lengths
        pred_probs = tagger.predict(X)
        pred_inds = np.argmax(pred_probs, axis=2)
        pred_label_seqs = []
        for pred_ind, x_len in zip(pred_inds, x_lens):
            pred_label_seq = self._tail(
                [self.rev_label_ind[pred] for pred in pred_ind], x_len)
            # Positive when the passage was longer than the model's sequence
            # length, i.e. some clauses never reached the model.
            num_ignored_clauses = max(0, x_len - len(pred_label_seq))
            if num_ignored_clauses > 0:
                warnings.warn(
                    "Test sequence too long. Ignoring %d clauses at the beginning and labeling them none."
                    % num_ignored_clauses)
                pred_label_seq = ["none"] * num_ignored_clauses + pred_label_seq
            pred_label_seqs.append(pred_label_seq)
        return pred_probs, pred_label_seqs, x_lens

    def fit_model(self, X, Y, use_attention, att_context, bidirectional, crf):
        """Build, compile and fit a tagger on ``X``/``Y`` and return it.

        Args:
            use_attention: Adds the projection, dropout and attention layers
                in front of the LSTM instead of a plain projection.
            att_context: Forwarded to ``TensorAttention``.
            bidirectional: Wraps the LSTM in ``Bidirectional``.
            crf: When truthy a CRF layer is appended and its loss/accuracy
                are used for compilation.
        """
        num_classes = len(self.label_ind)
        tagger = Sequential()
        word_proj_dim = 50
        if use_attention:
            sample_size, input_len, timesteps, input_dim = X.shape
            self.td1 = input_len
            self.td2 = timesteps
            tagger.add(HigherOrderTimeDistributedDense(
                input_dim=input_dim, output_dim=word_proj_dim))
            att_input_shape = (sample_size, input_len, timesteps,
                               word_proj_dim)
            tagger.add(Dropout(0.5))
            tagger.add(TensorAttention(att_input_shape, context=att_context))
        else:
            _, input_len, input_dim = X.shape
            tagger.add(TimeDistributed(
                Dense(input_dim=input_dim, units=word_proj_dim)))
        lstm = LSTM(input_shape=(input_len, word_proj_dim),
                    units=word_proj_dim, return_sequences=True)
        tagger.add(Bidirectional(lstm) if bidirectional else lstm)
        tagger.add(TimeDistributed(Dense(num_classes, activation='softmax')))
        epoch = 100
        if crf:
            crf_layer = CRF(num_classes, learn_mode="marginal")
            tagger.add(crf_layer)
            tagger.compile(optimizer='rmsprop', loss=crf_layer.loss_function,
                           metrics=[crf_layer.accuracy])
        else:
            tagger.compile(loss='categorical_crossentropy', optimizer='adam',
                           metrics=['accuracy'])
        tagger.fit(X, Y, validation_split=0.1, epochs=epoch, batch_size=10)
        tagger.summary()
        return tagger

    def train(self, X, Y, use_attention, att_context, bidirectional, cv=True,
              folds=5, crf=False):
        """Cross-validate, or fit on all data and save the model artifacts.

        With ``cv`` the per-fold and average scores are printed and
        ``self.tagger`` is left holding the last fold's model. Without it the
        model is fit on everything and its weights, label index and pickled
        representation reader are written to disk.
        """
        if cv:
            cv_folds = make_folds(X, Y, folds)
            accuracies = []
            fscores = []
            for fold_num, ((train_fold_X, train_fold_Y),
                           (test_fold_X, test_fold_Y)) in enumerate(cv_folds):
                self.tagger = self.fit_model(train_fold_X, train_fold_Y,
                                             use_attention, att_context,
                                             bidirectional, crf)
                pred_probs, pred_label_seqs, x_lens = self.predict(
                    test_fold_X, bidirectional, tagger=self.tagger)
                pred_inds = np.argmax(pred_probs, axis=2)
                flattened_preds = []
                flattened_targets = []
                for x_len, pred_ind, test_target in zip(
                        x_lens, pred_inds, test_fold_Y):
                    flattened_preds.extend(self._tail(pred_ind, x_len))
                    flattened_targets.extend(
                        [list(tt).index(1)
                         for tt in self._tail(test_target, x_len)])
                assert len(flattened_preds) == len(flattened_targets)
                accuracy, weighted_fscore, all_fscores = evaluate(
                    flattened_targets, flattened_preds)
                print("Finished fold %d. Accuracy: %f, Weighted F-score: %f"
                      % (fold_num, accuracy, weighted_fscore))
                print("Individual f-scores:")
                for cat in all_fscores:
                    print("%s: %f" % (self.rev_label_ind[cat],
                                      all_fscores[cat]))
                accuracies.append(accuracy)
                fscores.append(weighted_fscore)
            accuracies = np.asarray(accuracies)
            fscores = np.asarray(fscores)
            print("Accuracies:", accuracies)
            print("Average: %0.4f (+/- %0.4f)"
                  % (accuracies.mean(), accuracies.std() * 2))
            # ``sys.stderr`` was passed as a value to print, which wrote the
            # stream object's repr to stdout; route the text to stderr.
            print("Fscores:", fscores, file=sys.stderr)
            print("Average: %0.4f (+/- %0.4f)"
                  % (fscores.mean(), fscores.std() * 2), file=sys.stderr)
        else:
            self.tagger = self.fit_model(X, Y, use_attention, att_context,
                                         bidirectional, crf)
            model_ext = "att=%s_cont=%s_bi=%s" % (
                str(use_attention), att_context, str(bidirectional))
            model_weights_file_name = "model_%s_weights" % model_ext
            model_label_ind = "model_%s_label_ind.json" % model_ext
            model_rep_reader = "model_%s_rep_reader.pkl" % model_ext
            # The config file is still created (empty) as before, but the
            # handle is now closed instead of being leaked.
            # NOTE(review): nothing is written to it - confirm whether the
            # model JSON should be stored here.
            open("model_%s_config.json" % model_ext, "w").close()
            self.tagger.save_weights(model_weights_file_name, overwrite=True)
            with open(model_label_ind, "w") as label_ind_file:
                json.dump(self.label_ind, label_ind_file)
            with open(model_rep_reader, "wb") as rep_reader_file:
                pickle.dump(self.rep_reader, rep_reader_file)
class StatementClassifier(object):
    """Classifies single clauses with an MLP or RNN over word representations."""

    def __init__(self, word_rep_file, train=False, cv=True, folds=5,
                 modeltype="mlp", trained_model_name="trained_model.pkl",
                 tagset_file="tagset.pkl"):
        """Configure the classifier for training or prediction.

        Args:
            word_rep_file: Argument handed to ``RepReader``.
            train: When true no model is loaded; otherwise a stored model is
                unpickled from ``"<modeltype>_<trained_model_name>"``.
            cv: Whether ``train`` runs cross-validation first.
            folds: Number of cross-validation folds.
            modeltype: ``"mlp"`` or anything else for the RNN.
            trained_model_name: Suffix of the pickle file name.
            tagset_file: Accepted for interface compatibility; not used.
        """
        self.trained_model_name = "%s_%s" % (modeltype, trained_model_name)
        self.cv = cv
        self.folds = folds
        self.rep_reader = RepReader(word_rep_file)
        self.input_size = self.rep_reader.rep_shape[0]
        if modeltype == "mlp":
            self.hidden_sizes = [20, 10]
        else:
            self.hidden_size = 20
        self.max_iter = 100
        self.learning_rate = 0.01
        self.tag_index = None
        self.modeltype = modeltype
        if train:
            sys.stderr.write("Statement classifier initialized for training.\n")
            if self.cv:
                sys.stderr.write("Cross-validation will be done\n")
            self.classifier = None
        else:
            # NOTE: unpickling executes arbitrary code; only load trusted files.
            with open(self.trained_model_name, "rb") as model_file:
                self.classifier = cPickle.load(model_file)
            sys.stderr.write(
                "Stored model loaded. Statement classifier initialized for prediction.\n")

    def make_data(self, trainfile_name):
        """Read a tab-separated ``label<TAB>clause`` file into arrays.

        Returns:
            ``(X, Y, num_tags)``: clause representations (averaged per clause
            for the MLP), integer label indices, and the number of distinct
            labels in the file.
        """
        sys.stderr.write("Reading data..\n")
        with codecs.open(trainfile_name, "r", "utf-8") as train_file:
            train_data = [tuple(line.strip().split("\t"))
                          for line in train_file]
        shuffle(train_data)
        train_labels, train_clauses = zip(*train_data)
        train_labels = [tl.lower() for tl in train_labels]
        tagset = list(set(train_labels))
        if not self.tag_index:
            self.tag_index = {l: i for (i, l) in enumerate(tagset)}
        Y = numpy.asarray([self.tag_index[label] for label in train_labels])
        clause_reps = [self.rep_reader.get_clause_rep(clause.lower())
                       for clause in train_clauses]
        if self.modeltype == "mlp":
            X = numpy.asarray([numpy.mean(rep, axis=0) for rep in clause_reps])
        else:
            X = numpy.asarray(clause_reps)
        return X, Y, len(tagset)

    def classify(self, classifier, X):
        """Return the arg-max class index for every item of ``X``."""
        output_func = classifier.get_output_func()
        return [numpy.argmax(output_func(x)) for x in X]

    def fit_model(self, X, Y, num_classes):
        """Train a fresh classifier with one update per example per iteration."""
        if self.modeltype == "mlp":
            classifier = MLP(self.input_size, self.hidden_sizes, num_classes)
        else:
            classifier = RNN(self.input_size, self.hidden_size, num_classes)
        train_func = classifier.get_train_func(self.learning_rate)
        for _ in range(self.max_iter):
            for x, y in zip(X, Y):
                train_func(x, y)
        return classifier

    def train(self, trainfile_name):
        """Optionally cross-validate, then fit on all data and pickle the model."""
        train_X, train_Y, num_classes = self.make_data(trainfile_name)
        if self.cv:
            accuracies = []
            fscores = []
            num_points = train_X.shape[0]
            # Drop the remainder so the data splits into equal folds.
            usable = num_points - (num_points % self.folds)
            X_folds = numpy.split(train_X[:usable], self.folds)
            Y_folds = numpy.split(train_Y[:usable], self.folds)
            for i in range(self.folds):
                train_fold_X = numpy.concatenate(
                    [fold for j, fold in enumerate(X_folds) if j != i])
                train_fold_Y = numpy.concatenate(
                    [fold for j, fold in enumerate(Y_folds) if j != i])
                classifier = self.fit_model(train_fold_X, train_fold_Y,
                                            num_classes)
                predictions = self.classify(classifier, X_folds[i])
                accuracy, weighted_fscore, _ = self.evaluate(Y_folds[i],
                                                             predictions)
                accuracies.append(accuracy)
                fscores.append(weighted_fscore)
            accuracies = numpy.asarray(accuracies)
            fscores = numpy.asarray(fscores)
            sys.stderr.write("Accuracies: %s\n" % (accuracies,))
            sys.stderr.write("Average: %0.4f (+/- %0.4f)\n" % (
                accuracies.mean(), accuracies.std() * 2))
            sys.stderr.write("Fscores: %s\n" % (fscores,))
            sys.stderr.write("Average: %0.4f (+/- %0.4f)\n" % (
                fscores.mean(), fscores.std() * 2))
        self.classifier = self.fit_model(train_X, train_Y, num_classes)
        # Persist the model trained on ALL data. The previous code dumped the
        # local ``classifier`` (the last CV fold, or undefined without CV).
        with open(self.trained_model_name, "wb") as model_file:
            cPickle.dump(self.classifier, model_file)
        sys.stderr.write("Done\n")

    def evaluate(self, y, pred):
        """Score predictions against gold labels.

        Returns:
            ``(accuracy, weighted_fscore, fscores)`` where ``fscores`` maps
            each predicted class to its F1 and ``weighted_fscore`` weights
            them by gold frequency. All zeros/empty for empty input.
        """
        pairs = list(zip(y, pred))
        if not pairs:
            return 0.0, 0.0, {}
        num_gold = {}
        num_pred = {}
        num_correct = {}
        for c, p in pairs:
            num_gold[c] = num_gold.get(c, 0) + 1
            num_pred[p] = num_pred.get(p, 0) + 1
            if c == p:
                num_correct[c] = num_correct.get(c, 0) + 1
        accuracy = float(sum(num_correct.values())) / len(pred)
        fscores = {}
        for p in num_pred:
            if p in num_correct:
                precision = float(num_correct[p]) / num_pred[p]
                recall = float(num_correct[p]) / num_gold[p]
                fscores[p] = 2 * precision * recall / (precision + recall)
            else:
                fscores[p] = 0.0
        # Classes that were never predicted contribute 0 to the numerator.
        weighted_fscore = sum(fscores[p] * num_gold.get(p, 0)
                              for p in fscores) / sum(num_gold.values())
        return accuracy, weighted_fscore, fscores
parser.add_argument('-i', '--inFile', help='Input File') parser.add_argument('-t', '--textColumn', help='Name of text column') parser.add_argument('-l', '--labelColumn', help='Name of text column') parser.add_argument('-e', '--esIndex', help='ElasticSearch Index Name') parser.add_argument('-m', '--modelFile', help='Keras model file') ''' ''' SIGNATURE FOR ADDING FLAGS add_boolean_argument(parser, 'full_text_pdf') ''' args = parser.parse_args() base_dir = '/Users/Gully/Documents/Projects/2_active/corpora_local/intact/2018-04-17-cleanup/' index_name = 'oa_all_fasttext' model_file_name = 'i_meth_label.model.h5' rep_reader = RepReader(index_name=index_name, elastic=True) # From https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/input_fn/boston.py COLUMNS = ["ID", "i_meth", "p_meth", "pmid", "subfig", "text"] FEATURES = ["text"] LABEL = "p_meth" interaction_df = pd.read_csv(base_dir + 'ontologies/i_meth_codes.tsv', sep='\t', names=['text', 'uri', 'label'], index_col=0) interaction_df participant_df = pd.read_csv(base_dir + 'ontologies/p_meth_codes.tsv', sep='\t', names=['text', 'uri', 'label'],
# Command-line interface: three input descriptors, the test-set size, and the
# source of word representations (a file or an ElasticSearch index).
parser.add_argument('inFile', help='Input File')
parser.add_argument('textColumn', help='Name of text column')
parser.add_argument('labelColumn', help='Name of label column')
parser.add_argument('testSize', help='Size of held-out test set')
parser.add_argument('--kerasFile', help='Keras model file')
parser.add_argument('--esIndex', help='ElasticSearch Representation Index Name')
parser.add_argument('--repFile', help='Representation File Path')
add_boolean_argument(parser, 'randomizeTestSet')
args = parser.parse_args()

# A representation file takes precedence over an ElasticSearch index.
if args.repFile is not None:
    rep_reader = RepReader(embedding_file=args.repFile, elastic=False)
elif args.esIndex is not None:
    rep_reader = RepReader(index_name=args.esIndex, elastic=True)
else:
    # The message now names the options that are actually checked above.
    raise ValueError(
        "You must specify either repFile or esIndex. Neither specified.")

sd = SpreadsheetData(args.inFile, args.textColumn, args.labelColumn,
                     args.testSize, args.randomizeTestSet)

# embedding matrix
print('preparing embedding matrix...')
words_not_found = []
nb_words = min(sd.MAX_NB_WORDS, len(sd.word_index) + 1)
embed_dim = rep_reader.rep_shape[0]
embedding_matrix = np.zeros((nb_words, embed_dim))
class StatementClassifier(object):
    """Classifies single clauses with an MLP, RNN or LSTM model."""

    def __init__(self, word_rep_file, train=False, cv=True, folds=5,
                 modeltype="mlp", trained_model_name="trained_model.pkl",
                 tagset_file="tagset.pkl"):
        """Configure the classifier for training or prediction.

        Args:
            word_rep_file: Argument handed to ``RepReader``.
            train: When true no model is loaded; otherwise a stored model is
                unpickled from ``"<modeltype>_<trained_model_name>"``.
            cv: Whether ``train`` runs cross-validation.
            folds: Number of cross-validation folds.
            modeltype: ``"mlp"``, ``"rnn"`` or ``"lstm"``.
            trained_model_name: Suffix of the pickle file name.
            tagset_file: Accepted for interface compatibility; not used.
        """
        self.trained_model_name = "%s_%s" % (modeltype, trained_model_name)
        self.cv = cv
        self.folds = folds
        self.rep_reader = RepReader(word_rep_file)
        self.input_size = self.rep_reader.rep_shape[0]
        if modeltype == "mlp":
            self.hidden_sizes = [20, 10]
        else:
            self.hidden_size = 20
        self.max_iter = 100
        self.learning_rate = 0.01
        self.tag_index = None
        self.modeltype = modeltype
        if train:
            sys.stderr.write("Statement classifier initialized for training.\n")
            if self.cv:
                sys.stderr.write("Cross-validation will be done\n")
            self.classifier = None
        else:
            # NOTE: unpickling executes arbitrary code; only load trusted files.
            with open(self.trained_model_name, "rb") as model_file:
                self.classifier = cPickle.load(model_file)
            sys.stderr.write(
                "Stored model loaded. Statement classifier initialized for prediction.\n")

    def _unknown_modeltype(self):
        """Build the error raised when ``modeltype`` is not supported."""
        return ValueError("Unknown modeltype: %r" % (self.modeltype,))

    def make_data(self, trainfile_name):
        """Read a tab-separated ``clause<TAB>label`` file into arrays.

        Returns:
            ``(X, Y, num_tags)``. For ``"mlp"`` each clause is averaged into
            one vector; for ``"rnn"`` each clause keeps its own sequence; for
            ``"lstm"`` clauses are zero-padded at the front to equal length.

        Raises:
            ValueError: If ``modeltype`` is not one of the three above.
        """
        sys.stderr.write("Reading data..\n")
        with codecs.open(trainfile_name, "r", "utf-8") as train_file:
            train_data = [tuple(line.strip().split("\t"))
                          for line in train_file]
        shuffle(train_data)
        train_clauses, train_labels = zip(*train_data)
        train_labels = [tl.lower() for tl in train_labels]
        tagset = list(set(train_labels))
        if not self.tag_index:
            self.tag_index = {l: i for (i, l) in enumerate(tagset)}
        Y = numpy.asarray([self.tag_index[label] for label in train_labels])
        if self.modeltype not in ("mlp", "rnn", "lstm"):
            raise self._unknown_modeltype()
        clause_reps = [self.rep_reader.get_clause_rep(clause.lower())
                       for clause in train_clauses]
        if self.modeltype == "mlp":
            X = numpy.asarray([numpy.mean(rep, axis=0) for rep in clause_reps],
                              dtype='float32')
        elif self.modeltype == "rnn":
            X = numpy.asarray([numpy.asarray(rep, dtype='float32')
                               for rep in clause_reps])
        else:
            maxlen = max(len(rep) for rep in clause_reps)
            X = numpy.zeros((len(train_clauses), maxlen,
                             max(self.rep_reader.rep_shape)))
            # Each clause fills the tail of its row, so the zero padding
            # sits at the front of the sequence.
            for i, rep in enumerate(clause_reps):
                X[i][-len(rep):] = rep
        return X, Y, len(tagset)

    def classify(self, classifier, X):
        """Return the arg-max class index for every item of ``X``.

        Raises:
            ValueError: If ``modeltype`` is not supported.
        """
        if self.modeltype in ("mlp", "rnn"):
            output_func = classifier.get_output_func()
            return [numpy.argmax(output_func(x)) for x in X]
        if self.modeltype == "lstm":
            return [numpy.argmax(classifier.predict(numpy.asarray([x])))
                    for x in X]
        raise self._unknown_modeltype()

    def fit_model(self, X, Y, num_classes):
        """Train and return a fresh classifier of the configured type.

        Raises:
            ValueError: If ``modeltype`` is not supported.
        """
        if self.modeltype in ("mlp", "rnn"):
            if self.modeltype == "mlp":
                classifier = MLP(self.input_size, self.hidden_sizes,
                                 num_classes)
            else:
                classifier = RNN(self.input_size, self.hidden_size,
                                 num_classes)
            train_func = classifier.get_train_func(self.learning_rate)
            for _ in range(self.max_iter):
                for x, y in zip(X, Y):
                    train_func(x, y)
            return classifier
        if self.modeltype == "lstm":
            classifier = Sequential()
            # Floor division keeps the layer size an int on every Python.
            classifier.add(LSTM(input_dim=self.input_size,
                                output_dim=self.input_size // 2))
            classifier.add(Dense(num_classes, activation='softmax'))
            classifier.compile(loss='categorical_crossentropy',
                               optimizer='adam')
            Y_indexed = numpy.zeros((len(Y), num_classes))
            for i, y in enumerate(Y):
                Y_indexed[i][y] = 1
            classifier.fit(X, Y_indexed, nb_epoch=20)
            return classifier
        raise self._unknown_modeltype()

    def train(self, trainfile_name):
        """Run cross-validation (when enabled) and report the scores.

        NOTE: fitting a final model on all data and pickling it is disabled
        in this version, so ``self.classifier`` is not updated here.
        """
        train_X, train_Y, num_classes = self.make_data(trainfile_name)
        if self.cv:
            accuracies = []
            fscores = []
            cv_folds = make_folds(train_X, train_Y, self.folds)
            for i, ((train_fold_X, train_fold_Y),
                    (test_fold_X, test_fold_Y)) in enumerate(cv_folds):
                classifier = self.fit_model(train_fold_X, train_fold_Y,
                                            num_classes)
                predictions = self.classify(classifier, test_fold_X)
                accuracy, weighted_fscore, _ = evaluate(test_fold_Y,
                                                        predictions)
                sys.stderr.write(
                    "Finished fold %d. Accuracy: %f, F-score: %f\n"
                    % (i, accuracy, weighted_fscore))
                accuracies.append(accuracy)
                fscores.append(weighted_fscore)
            accuracies = numpy.asarray(accuracies)
            fscores = numpy.asarray(fscores)
            sys.stderr.write("Accuracies: %s\n" % (accuracies,))
            sys.stderr.write("Average: %0.4f (+/- %0.4f)\n" % (
                accuracies.mean(), accuracies.std() * 2))
            sys.stderr.write("Fscores: %s\n" % (fscores,))
            sys.stderr.write("Average: %0.4f (+/- %0.4f)\n" % (
                fscores.mean(), fscores.std() * 2))
        sys.stderr.write("Done\n")
def __init__(self, word_rep_file):
    """Load word representations and start with no trained tagger.

    Args:
        word_rep_file: passed straight to ``RepReader`` to load the word
            representations.
    """
    reader = RepReader(word_rep_file)
    self.rep_reader = reader
    # The first entry of the reader's rep_shape is used as the input size.
    self.input_size = reader.rep_shape[0]
    # Nothing has been trained yet.
    self.tagger = None
class PassageTagger(object):
    """Tags every clause of a passage with a label using an LSTM model.

    Clauses are turned into vectors through a ``RepReader``; each passage is
    left-padded with zeros to a common length before being fed to either a
    ``Sequential`` model or a bidirectional ``Graph`` model.
    """

    def __init__(self, word_rep_file):
        """Load word representations from `word_rep_file`; no tagger yet."""
        self.rep_reader = RepReader(word_rep_file)
        self.input_size = self.rep_reader.rep_shape[0]
        self.tagger = None

    def make_data(self, trainfilename, use_attention, maxseqlen=None,
                  maxclauselen=None, label_ind=None, train=False):
        """Read passages from a file and build padded input/target arrays.

        Side effects: sets ``self.label_ind`` (label -> index; a `label_ind`
        passed by the caller is extended in place with any unseen labels)
        and ``self.rev_label_ind`` (index -> label).

        Args:
            trainfilename: file handed to ``read_passages``.
            use_attention: if true, keep one vector per word
                (``maxseqlen x maxclauselen x input_size`` per passage);
                otherwise average the word vectors of each clause
                (``maxseqlen x input_size`` per passage).
            maxseqlen: number of clauses each passage is padded/truncated
                to. Defaults to the longest passage read.
            maxclauselen: number of words each clause is padded/truncated
                to (attention only). Defaults to the longest clause read.
            label_ind: existing label -> index mapping to reuse.
            train: whether the data is labelled; targets are only built
                when this is true.

        Returns:
            ``(X, Y)`` as numpy arrays. ``Y`` holds one-hot targets and is
            empty when `train` is false.
        """
        print >>sys.stderr, "Reading data.."
        str_seqs, label_seqs = read_passages(trainfilename, train)
        self.label_ind = label_ind if label_ind else {"none": 0}
        if not maxseqlen:
            # Measure the clause sequences, since those are what the rows of
            # x are filled from.
            # NOTE(review): label sequences may be empty for unlabelled
            # input, depending on read_passages -- confirm.
            maxseqlen = max(len(str_seq) for str_seq in str_seqs)
        if use_attention and not maxclauselen:
            maxclauselen = max(len(clause.split())
                               for str_seq in str_seqs
                               for clause in str_seq)
        X = []
        Y_inds = []
        for str_seq, label_seq in zip(str_seqs, label_seqs):
            for label in label_seq:
                if label not in self.label_ind:
                    self.label_ind[label] = len(self.label_ind)
            if use_attention:
                x = numpy.zeros((maxseqlen, maxclauselen, self.input_size))
            else:
                x = numpy.zeros((maxseqlen, self.input_size))
            # Integer dtype: these values are used as array indices below,
            # and NumPy rejects float indices.
            y_ind = numpy.zeros(maxseqlen, dtype=int)
            seq_len = len(str_seq)
            # The following conditional is true only when we've already
            # trained, and one of the sequences in the test set is longer
            # than the longest sequence in training.
            if seq_len > maxseqlen:
                str_seq = str_seq[:maxseqlen]
                seq_len = maxseqlen
            if train:
                for i, (clause, label) in enumerate(zip(str_seq, label_seq)):
                    # i - seq_len is negative: data is right-aligned so the
                    # padding sits at the start of the sequence.
                    self._embed_clause(x, i - seq_len, clause,
                                       use_attention, maxclauselen)
                    y_ind[i - seq_len] = self.label_ind[label]
                Y_inds.append(y_ind)
            else:
                for i, clause in enumerate(str_seq):
                    self._embed_clause(x, i - seq_len, clause,
                                       use_attention, maxclauselen)
            X.append(x)
        Y = []
        num_labels = len(self.label_ind)
        for y_ind in Y_inds:
            # One-hot encode; padded positions keep index 0 ("none" in the
            # default mapping).
            y = numpy.zeros((maxseqlen, num_labels))
            y[numpy.arange(maxseqlen), y_ind] = 1
            Y.append(y)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
        return numpy.asarray(X), numpy.asarray(Y)

    def _embed_clause(self, x, position, clause, use_attention, maxclauselen):
        """Write the representation of `clause` into row `position` of `x`."""
        clause_rep = self.rep_reader.get_clause_rep(clause)
        if use_attention:
            if len(clause_rep) > maxclauselen:
                clause_rep = clause_rep[:maxclauselen]
            # Guard the empty case: a slice starting at -0 would address the
            # whole row instead of nothing.
            if len(clause_rep):
                x[position][-len(clause_rep):] = clause_rep
        else:
            x[position] = numpy.mean(clause_rep, axis=0)

    @staticmethod
    def _unpadded_length(x):
        """Return how many trailing rows of `x` remain after leading zeros."""
        for i, xi in enumerate(x):
            # any() instead of sum() != 0: a real vector whose components
            # cancel out must not be mistaken for padding.
            if numpy.any(xi):
                return len(x) - i
        return 0

    def get_attention_weights(self, X_test):
        """Return the output of the model's attention layer for `X_test`.

        Raises:
            RuntimeError: if no tagger is available, or the tagger has no
                layer whose configured name is "tensorattention".
        """
        if self.tagger is None:
            raise RuntimeError("Tagger not trained yet!")
        inp = self.tagger.get_input()
        att_out = None
        for layer in self.tagger.layers:
            if layer.get_config()['name'].lower() == "tensorattention":
                att_out = layer.get_output()
                break
        # Compare with None explicitly: the layer output is a symbolic
        # variable and must not be truth-tested.
        if att_out is None:
            raise RuntimeError("No attention layer found!")
        f = theano.function([inp], att_out)
        return f(X_test)

    def predict(self, X, bidirectional, tagger=None):
        """Predict a label sequence for every padded passage in `X`.

        Args:
            X: padded inputs as produced by ``make_data``.
            bidirectional: whether `tagger` is the ``Graph`` model (dict
                input/output) rather than the ``Sequential`` one.
            tagger: model to use; defaults to ``self.tagger``.

        Returns:
            ``(pred_probs, pred_label_seqs, x_lens)``: the raw model output,
            the predicted labels with padding positions removed, and the
            unpadded length of every passage.

        Raises:
            RuntimeError: if no tagger is available.
        """
        if tagger is None:
            tagger = self.tagger
        if tagger is None:
            raise RuntimeError("Tagger not trained yet!")
        # Determining actual lengths sans padding
        x_lens = [self._unpadded_length(x) for x in X]
        if bidirectional:
            pred_probs = tagger.predict({'input': X})['output']
        else:
            pred_probs = tagger.predict(X)
        pred_inds = numpy.argmax(pred_probs, axis=2)
        pred_label_seqs = []
        for pred_ind, x_len in zip(pred_inds, x_lens):
            # Explicit start index: [-x_len:] would return everything when
            # x_len is 0.
            start = len(pred_ind) - x_len
            pred_label_seqs.append(
                [self.rev_label_ind[pred] for pred in pred_ind[start:]])
        return pred_probs, pred_label_seqs, x_lens

    def fit_model(self, X, Y, use_attention, att_context, bidirectional):
        """Build, compile and fit a tagger on `X`/`Y`; return the model.

        With `bidirectional` a ``Graph`` with a forward and a backward LSTM
        is built; otherwise a ``Sequential`` model that first projects the
        inputs to 50 dimensions. `use_attention` inserts a
        ``TensorAttention`` layer configured with `att_context`.
        """
        print >>sys.stderr, "Input shape:", X.shape, Y.shape
        num_classes = len(self.label_ind)
        if bidirectional:
            # Floor division keeps the layer size an integer.
            lstm_dim = X.shape[-1] // 2
            tagger = Graph()
            tagger.add_input(name='input', input_shape=X.shape[1:])
            if use_attention:
                tagger.add_node(
                    TensorAttention(X.shape[1:], context=att_context),
                    name='attention', input='input')
                lstm_input_node = 'attention'
            else:
                lstm_input_node = 'input'
            tagger.add_node(LSTM(lstm_dim, return_sequences=True),
                            name='forward', input=lstm_input_node)
            tagger.add_node(
                LSTM(lstm_dim, return_sequences=True, go_backwards=True),
                name='backward', input=lstm_input_node)
            tagger.add_node(
                TimeDistributedDense(num_classes, activation='softmax'),
                name='softmax', inputs=['forward', 'backward'],
                merge_mode='concat', concat_axis=-1)
            tagger.add_output(name='output', input='softmax')
            print >>sys.stderr, tagger.summary()
            tagger.compile('adam', {'output': 'categorical_crossentropy'})
            tagger.fit({'input': X, 'output': Y})
        else:
            tagger = Sequential()
            word_proj_dim = 50
            if use_attention:
                _, input_len, timesteps, input_dim = X.shape
                tagger.add(HigherOrderTimeDistributedDense(
                    input_dim=input_dim, output_dim=word_proj_dim))
                att_input_shape = (input_len, timesteps, word_proj_dim)
                print >>sys.stderr, "Attention input shape:", att_input_shape
                tagger.add(Dropout(0.5))
                tagger.add(TensorAttention(att_input_shape,
                                           context=att_context))
            else:
                _, input_len, input_dim = X.shape
                tagger.add(TimeDistributedDense(input_dim=input_dim,
                                                output_dim=word_proj_dim))
            tagger.add(LSTM(input_dim=word_proj_dim, output_dim=word_proj_dim,
                            input_length=input_len, return_sequences=True))
            tagger.add(TimeDistributedDense(num_classes, activation='softmax'))
            print >>sys.stderr, tagger.summary()
            tagger.compile(loss='categorical_crossentropy', optimizer='adam')
            tagger.fit(X, Y, batch_size=10)
        return tagger

    def train(self, X, Y, use_attention, att_context, bidirectional, cv=True,
              folds=5):
        """Optionally cross-validate, then train on all data and save.

        With `cv` a model is fitted per fold (folds come from
        ``make_folds``) and accuracy / F-scores are reported on stderr.
        Afterwards the final model is fitted on all of `X`/`Y`, stored in
        ``self.tagger``, and its config, weights and label index are
        written to files named after the model settings, using relative
        paths.
        """
        if cv:
            cv_folds = make_folds(X, Y, folds)
            accuracies = []
            fscores = []
            for fold_num, ((train_fold_X, train_fold_Y),
                           (test_fold_X, test_fold_Y)) in enumerate(cv_folds):
                tagger = self.fit_model(train_fold_X, train_fold_Y,
                                        use_attention, att_context,
                                        bidirectional)
                pred_probs, _, x_lens = self.predict(test_fold_X,
                                                     bidirectional, tagger)
                pred_inds = numpy.argmax(pred_probs, axis=2)
                flattened_preds = []
                flattened_targets = []
                for x_len, pred_ind, test_target in zip(x_lens, pred_inds,
                                                        test_fold_Y):
                    # Explicit start indices so that a fully padded
                    # sequence (x_len == 0) contributes nothing.
                    flattened_preds.extend(pred_ind[len(pred_ind) - x_len:])
                    flattened_targets.extend(
                        [list(tt).index(1)
                         for tt in test_target[len(test_target) - x_len:]])
                assert len(flattened_preds) == len(flattened_targets)
                accuracy, weighted_fscore, all_fscores = evaluate(
                    flattened_targets, flattened_preds)
                print >>sys.stderr, "Finished fold %d. Accuracy: %f, Weighted F-score: %f"%(fold_num, accuracy, weighted_fscore)
                print >>sys.stderr, "Individual f-scores:"
                for cat in all_fscores:
                    print >>sys.stderr, "%s: %f"%(self.rev_label_ind[cat], all_fscores[cat])
                accuracies.append(accuracy)
                fscores.append(weighted_fscore)
            accuracies = numpy.asarray(accuracies)
            fscores = numpy.asarray(fscores)
            print >>sys.stderr, "Accuracies:", accuracies
            print >>sys.stderr, "Average: %0.4f (+/- %0.4f)"%(accuracies.mean(), accuracies.std() * 2)
            print >>sys.stderr, "Fscores:", fscores
            print >>sys.stderr, "Average: %0.4f (+/- %0.4f)"%(fscores.mean(), fscores.std() * 2)
        self.tagger = self.fit_model(X, Y, use_attention, att_context,
                                     bidirectional)
        model_ext = "att=%s_cont=%s_bi=%s"%(str(use_attention), att_context, str(bidirectional))
        model_weights_file_name = "model_%s_weights"%model_ext
        model_label_ind = "model_%s_label_ind.json"%model_ext
        # Context managers make sure both files are flushed and closed.
        with open("model_%s_config.json"%model_ext, "w") as model_config_file:
            print >>model_config_file, self.tagger.to_json()
        self.tagger.save_weights(model_weights_file_name)
        with open(model_label_ind, "w") as label_ind_file:
            json.dump(self.label_ind, label_ind_file)
class StatementClassifier(object):
    """Classifies single clauses with an MLP or an RNN.

    In training mode the classifier is built by ``train``; otherwise a
    previously pickled model is loaded in the constructor.
    """

    def __init__(self, word_rep_file, train=False, cv=True, folds=5,
                 modeltype="mlp", trained_model_name="trained_model.pkl",
                 tagset_file="tagset.pkl"):
        """Configure the classifier and, unless training, load a model.

        Args:
            word_rep_file: passed to ``RepReader``.
            train: if false, the pickled model is loaded immediately.
            cv: whether ``train`` runs cross-validation first.
            folds: number of cross-validation folds.
            modeltype: "mlp" selects the MLP; any other value the RNN.
            trained_model_name: file name suffix of the pickled model; the
                model type is prepended to it.
            tagset_file: accepted but not used by this class.
        """
        self.trained_model_name = "%s_%s" % (modeltype, trained_model_name)
        self.cv = cv
        self.folds = folds
        self.rep_reader = RepReader(word_rep_file)
        self.input_size = self.rep_reader.rep_shape[0]
        if modeltype == "mlp":
            self.hidden_sizes = [20, 10]
        else:
            self.hidden_size = 20
        self.max_iter = 100
        self.learning_rate = 0.01
        self.tag_index = None
        self.modeltype = modeltype
        if train:
            print >> sys.stderr, "Statement classifier initialized for training."
            if self.cv:
                print >> sys.stderr, "Cross-validation will be done"
            self.classifier = None
        else:
            # NOTE(security): unpickling can execute arbitrary code; only
            # load model files from a trusted source.
            with open(self.trained_model_name, "rb") as model_file:
                self.classifier = cPickle.load(model_file)
            print >> sys.stderr, "Stored model loaded. Statement classifier initialized for prediction."

    def make_data(self, trainfile_name):
        """Read tab-separated ``label<TAB>clause`` lines into arrays.

        The examples are shuffled, labels are lower-cased, and
        ``self.tag_index`` (label -> class index) is created on first use.

        Returns:
            ``(X, Y, num_classes)``: clause representations (averaged over
            words for the MLP), class indices, and the number of distinct
            labels in the file.
        """
        print >> sys.stderr, "Reading data.."
        with codecs.open(trainfile_name, "r", "utf-8") as train_file:
            # Blank lines are skipped; they would otherwise break the
            # two-column unpacking below.
            train_data = [
                tuple(line.strip().split("\t"))
                for line in train_file if line.strip()
            ]
        shuffle(train_data)
        train_labels, train_clauses = zip(*train_data)
        train_labels = [tl.lower() for tl in train_labels]
        # Sorted so that class indices are the same on every run.
        tagset = sorted(set(train_labels))
        if not self.tag_index:
            self.tag_index = {l: i for (i, l) in enumerate(tagset)}
        Y = numpy.asarray([self.tag_index[label] for label in train_labels])
        if self.modeltype == "mlp":
            X = numpy.asarray([
                numpy.mean(self.rep_reader.get_clause_rep(clause.lower()),
                           axis=0)
                for clause in train_clauses
            ])
        else:
            X = numpy.asarray([
                self.rep_reader.get_clause_rep(clause.lower())
                for clause in train_clauses
            ])
        return X, Y, len(tagset)

    def classify(self, classifier, X):
        """Return the arg-max class index of `classifier` for each x in X."""
        output_func = classifier.get_output_func()
        predictions = [numpy.argmax(output_func(x)) for x in X]
        return predictions

    def fit_model(self, X, Y, num_classes):
        """Train a fresh MLP/RNN, one example at a time, and return it."""
        if self.modeltype == "mlp":
            classifier = MLP(self.input_size, self.hidden_sizes, num_classes)
        else:
            classifier = RNN(self.input_size, self.hidden_size, num_classes)
        train_func = classifier.get_train_func(self.learning_rate)
        for _ in range(self.max_iter):
            for x, y in zip(X, Y):
                train_func(x, y)
        return classifier

    def train(self, trainfile_name):
        """Optionally cross-validate, then train on all data and pickle.

        The final model is stored in ``self.classifier`` and written to
        ``self.trained_model_name``.
        """
        train_X, train_Y, num_classes = self.make_data(trainfile_name)
        accuracies = []
        fscores = []
        if self.cv:
            # array_split tolerates sizes that are not a multiple of the
            # fold count, so no examples are left out of cross-validation.
            X_folds = numpy.array_split(train_X, self.folds)
            Y_folds = numpy.array_split(train_Y, self.folds)
            for i in range(self.folds):
                train_fold_X = numpy.concatenate(
                    [fold for j, fold in enumerate(X_folds) if j != i])
                train_fold_Y = numpy.concatenate(
                    [fold for j, fold in enumerate(Y_folds) if j != i])
                classifier = self.fit_model(train_fold_X, train_fold_Y,
                                            num_classes)
                predictions = self.classify(classifier, X_folds[i])
                accuracy, weighted_fscore, _ = self.evaluate(
                    Y_folds[i], predictions)
                accuracies.append(accuracy)
                fscores.append(weighted_fscore)
            accuracies = numpy.asarray(accuracies)
            fscores = numpy.asarray(fscores)
            print >> sys.stderr, "Accuracies:", accuracies
            print >> sys.stderr, "Average: %0.4f (+/- %0.4f)" % (
                accuracies.mean(), accuracies.std() * 2)
            print >> sys.stderr, "Fscores:", fscores
            print >> sys.stderr, "Average: %0.4f (+/- %0.4f)" % (
                fscores.mean(), fscores.std() * 2)
        self.classifier = self.fit_model(train_X, train_Y, num_classes)
        # Persist the model trained on all data, not a per-fold one.
        with open(self.trained_model_name, "wb") as model_file:
            cPickle.dump(self.classifier, model_file)
        print >> sys.stderr, "Done"

    def evaluate(self, y, pred):
        """Score predictions `pred` against gold labels `y`.

        Returns:
            ``(accuracy, weighted_fscore, fscores)`` where `fscores` maps
            every predicted class to its F1 and `weighted_fscore` weights
            those by gold frequency. Empty input yields ``(0.0, 0.0, {})``.
        """
        from collections import Counter

        if len(y) == 0 or len(pred) == 0:
            return 0.0, 0.0, {}
        pairs = list(zip(y, pred))
        accuracy = float(sum([c == p for c, p in pairs])) / len(pred)
        num_gold = Counter(c for c, _ in pairs)
        num_pred = Counter(p for _, p in pairs)
        num_correct = Counter(c for c, p in pairs if c == p)
        fscores = {}
        for p in num_pred:
            correct = num_correct[p]  # 0 when the class was never right
            if correct == 0:
                fscores[p] = 0.0
                continue
            precision = float(correct) / num_pred[p]
            recall = float(correct) / num_gold[p]
            fscores[p] = 2 * precision * recall / (precision + recall)
        # Classes that never occur in the gold labels have weight 0.
        weighted_fscore = sum(
            [fscores[p] * num_gold[p] for p in fscores]
        ) / sum(num_gold.values())
        return accuracy, weighted_fscore, fscores