# Imports consolidated for the snippets in this listing. Model, Trainer,
# design, convert_to_IOB and fix_dashes_slashes are project-internal modules;
# the GUI code assumes PyQt5.
import csv
import os
import random
import subprocess
import threading

import joblib
import nltk
import scipy.stats
import sklearn_crfsuite
from PyQt5 import QtGui, QtWidgets
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import metrics


def view_tree(sentence):
    """Predicts entity labels for a sentence and draws the resulting chunk tree."""
    model = Model('../model.pkl')
    ne_labels = model.predict(sentence)
    sent = nltk.word_tokenize(sentence)
    pos = nltk.pos_tag(sent)
    sent = list(zip([x[0] for x in pos], [x[1] for x in pos], ne_labels[0]))
    # sent = convert_to_IOB(sent)
    print(sent)
    text = ''
    for t, p, n in sent:
        text += t + ' ' + p + ' ' + n[1] + '\n'
    tree = nltk.chunk.conllstr2tree(
        text, chunk_types=['DOS', 'UNIT', 'WHO', 'O', 'FREQ', 'PER'])
    tree.draw()
def convert_to_ne_tree(sentence):
    """Takes a string sentence as input and returns a tree structure with the
    named entities grouped in subtrees.

    Returns a bracket-notation tree.
    """
    model = Model()
    ne_labels = model.predict(sentence)
    sent = nltk.word_tokenize(sentence)
    pos = nltk.pos_tag(sent)
    sent = list(zip([x[0] for x in pos], [x[1] for x in pos], ne_labels))
    sent = convert_to_IOB(sent)
    text = ''
    for t, p, n in sent:
        text += t + ' ' + p + ' ' + n + '\n'
    tree = nltk.chunk.conllstr2tree(
        text, chunk_types=['DOS', 'UNIT', 'WHO', 'O', 'FREQ', 'PER'])
    return tree
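# Usage sketch (not part of the original source): exercises convert_to_ne_tree()
# on an invented dosage instruction. Assumes a trained model is loadable via
# Model() and that nltk's 'punkt' and 'averaged_perceptron_tagger' data are
# installed; view_tree() above does the same but draws the tree in a Tk window.
def _demo_ne_tree():
    tree = convert_to_ne_tree('Take 2 tablets twice a day for 10 days')
    tree.pprint()  # print the bracketed tree; tree.draw() opens a viewer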
class App(QtWidgets.QMainWindow, design.Ui_MainWindow):
    """
    MVC controller.
    """

    def __init__(self, parent=None):
        super(App, self).__init__(parent)
        self.setupUi(self)
        self.model = Model()
        self.trainer = Trainer()
        self.pushButton.clicked.connect(self.__extract__)
        self.pushButton_2.clicked.connect(self.__load_text__)
        self.pushButton_3.clicked.connect(self.__train_model__)
        self.pushButton_4.clicked.connect(self.__load_model__)
        self.css = '''
            label {
                font-style: normal;
                padding-right: 4px;
            }
        '''
        self.text_doc = QtGui.QTextDocument()
        self.text_doc.setDefaultStyleSheet(self.css)
        self.text_doc.setHtml('<body></body>')
        self.textEdit.setDocument(self.text_doc)
        self.color_dict = {
            'O': '#FFFFFF',
            'DOS': '#EE964B',
            'UNIT': '#F95738',
            'WHO': '#8F78AD',
            'FREQ': '#D4BA6A',
            'DUR': '#BBCD67'
        }
        # Legend labels, one per entity type, tinted with the entity colour.
        self.label.setText(f"<html><head/><body><p><span style=\" background-color:"
                           f"{self.color_dict['DOS']};\">Dosage</span></p></body></html>")
        self.label_2.setText(f"<html><head/><body><p><span style=\" background-color:"
                             f"{self.color_dict['UNIT']};\">Unit</span></p></body></html>")
        self.label_3.setText(f"<html><head/><body><p><span style=\" background-color:"
                             f"{self.color_dict['WHO']};\">Who</span></p></body></html>")
        self.label_4.setText(f"<html><head/><body><p><span style=\" background-color:"
                             f"{self.color_dict['FREQ']};\">Frequency</span></p></body></html>")
        self.label_5.setText(f"<html><head/><body><p><span style=\" background-color:"
                             f"{self.color_dict['DUR']};\">Period</span></p></body></html>")

    def __load_text__(self):
        file_path = QtWidgets.QFileDialog.getOpenFileName(
            QtWidgets.QFileDialog(), 'Open file', '~', "All files *")[0]
        if file_path:
            with open(file_path) as file:
                text = file.read()
            self.text_doc.setHtml('<body>' + text + '</body>')

    def __train_finished__(self, value):
        print('received result: ' + str(value))
        self.pushButton_3.setText("Train")
        self.pushButton_3.setEnabled(True)

    def __train_model__(self):
        def __train_in_thread__(data_set):
            process = subprocess.Popen(
                ['python', 'model/crf_trainer.py', data_set],
                stdout=subprocess.PIPE)
            out, _ = process.communicate()
            self.__train_finished__(out)

        data_set = QtWidgets.QFileDialog.getOpenFileName(
            QtWidgets.QFileDialog(), 'Select data set', '~', "*.tsv")[0]
        thread = threading.Thread(target=__train_in_thread__, args=(data_set,))
        thread.start()
        self.pushButton_3.setText("Training...")
        self.pushButton_3.setEnabled(False)

    def __load_model__(self):
        model_file = QtWidgets.QFileDialog.getOpenFileName(
            QtWidgets.QFileDialog(), 'Select model', '~', "*.pkl")[0]
        if model_file:
            self.model.load(model_file)

    def __extract__(self):
        text = self.textEdit.toPlainText()
        parsed_sentences = self.model.predict(text)
        self.textEdit.clear()
        labeled_text = []
        for sentence in parsed_sentences:
            labeled_text += sentence
        self.textEdit.setDocument(self.__get_rich_text(labeled_text))

    def __get_rich_text(self, parsed_text):
        rich_text = '<body>'
        length = len(parsed_text)
        for i, (word, label) in enumerate(parsed_text):
            if label == 'O':
                rich_text += f'<label>{word}</label>'
            else:
                label = label.split('-')[1]
                rich_text += f'<nobr><label style="background-color:' \
                             f'{self.color_dict[label]}">{word}</label></nobr>'
            # to keep the full stop next to the last word
            if i < length - 2:
                rich_text += '<label> </label>'
        rich_text += '</body>'
        self.text_doc.setHtml(rich_text)
        return self.text_doc
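# Usage sketch (not part of the original source): the conventional PyQt5 entry
# point for the App window above, assuming design.Ui_MainWindow was generated
# with pyuic5 and this module is run as a script.
if __name__ == '__main__':
    import sys
    qt_app = QtWidgets.QApplication(sys.argv)
    window = App()
    window.show()
    sys.exit(qt_app.exec_())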
class Trainer:
    """
    Generates a CRF model given a data set of labeled words.
    """

    def __init__(self, model_file='model/model.pkl'):
        self.model = Model()

    def generate_model(self, data_set):
        """
        Generates a CRF model given the data set. It saves the model to disk
        with the name 'model.pkl'.

        :param data_set: Path to the labeled data set.
        :return: Performance results.
        """
        x_train, y_train, x_test, y_test = self.gen_test_train(data_set)
        results = self.gen_model(x_train, y_train, x_test, y_test)
        return results

    def gen_model(self, x_train, y_train, x_test, y_test):
        # Strip the IOB prefixes so that training and scoring use bare
        # entity labels.
        for i in range(len(y_train)):
            for j in range(len(y_train[i])):
                y_train[i][j] = y_train[i][j].replace('B-', '')
                y_train[i][j] = y_train[i][j].replace('O-', '')
                y_train[i][j] = y_train[i][j].replace('I-', '')
        for i in range(len(y_test)):
            for j in range(len(y_test[i])):
                y_test[i][j] = y_test[i][j].replace('B-', '')
                y_test[i][j] = y_test[i][j].replace('O-', '')
                y_test[i][j] = y_test[i][j].replace('I-', '')
        labels = ['DOS', 'UNIT', 'FREQ', 'DUR', 'WHO']
        # labels = ['O-DOS', 'B-DOS', 'I-UNIT', 'B-UNIT', 'O-UNIT', 'I-FREQ',
        #           'B-FREQ', 'O-FREQ', 'I-DUR', 'B-DUR', 'O-DUR', 'I-WHO',
        #           'B-WHO', 'O-WHO']
        # labels = ['m', 'r', 'f', 'do', 'du', 'mo']
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   max_iterations=100,
                                   all_possible_transitions=True)
        params_space = {
            'c1': scipy.stats.expon(scale=0.5),
            'c2': scipy.stats.expon(scale=0.05),
        }
        # use the same metric for evaluation
        f1_scorer = make_scorer(metrics.flat_f1_score,
                                average='weighted', labels=labels)
        # randomized search over the regularization parameters
        rand_search = RandomizedSearchCV(crf, params_space,
                                         cv=3, verbose=1, n_jobs=-1,
                                         n_iter=50, scoring=f1_scorer)
        rand_search.fit(x_train, y_train)
        crf = rand_search.best_estimator_
        y_prediction = crf.predict(x_test)
        # group B and I results
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        joblib.dump(crf, 'model.pkl')
        precision = metrics.flat_precision_score(y_test, y_prediction,
                                                 labels=sorted_labels,
                                                 average='micro')
        recall = metrics.flat_recall_score(y_test, y_prediction,
                                           labels=sorted_labels,
                                           average='micro')
        f1 = metrics.flat_f1_score(y_test, y_prediction,
                                   labels=sorted_labels, average='micro')
        print('MICRO')
        print(precision, recall, f1)
        precision = metrics.flat_precision_score(y_test, y_prediction,
                                                 labels=sorted_labels,
                                                 average='macro')
        recall = metrics.flat_recall_score(y_test, y_prediction,
                                           labels=sorted_labels,
                                           average='macro')
        f1 = metrics.flat_f1_score(y_test, y_prediction,
                                   labels=sorted_labels, average='macro')
        print('MACRO')
        print(precision, recall, f1)
        return metrics.flat_classification_report(y_test, y_prediction,
                                                  labels=sorted_labels,
                                                  digits=3)

    def validate_performance(self, test_set):
        sentences = self.__load_corpus__(test_set)
        y_test = [self.model.sentence2labels(s) for s in sentences]
        y_prediction = []
        for i, sent in enumerate(sentences):
            new_sent = ' '.join([word[0] for word in sent])
            prediction = self.model.predict(new_sent)
            new_prediction = []
            if len(prediction) > 1:
                # the tokenizer split the line into several sentences;
                # flatten them back into a single prediction
                for p in prediction:
                    new_prediction += [p1 for p1 in p]
                prediction = new_prediction
            else:
                prediction = prediction[0]
            try:
                pred = [w[1] for w in prediction]
            except Exception:
                print(prediction)
                return
            y_prediction.append(pred)
        # Strip the IOB prefixes and score on the bare entity labels.
        for i in range(len(y_prediction)):
            for j in range(len(y_prediction[i])):
                y_prediction[i][j] = y_prediction[i][j].replace('B-', '')
                y_prediction[i][j] = y_prediction[i][j].replace('O-', '')
                y_prediction[i][j] = y_prediction[i][j].replace('I-', '')
        for i in range(len(y_test)):
            for j in range(len(y_test[i])):
                y_test[i][j] = y_test[i][j].replace('B-', '')
                y_test[i][j] = y_test[i][j].replace('O-', '')
                y_test[i][j] = y_test[i][j].replace('I-', '')
        labels = ['DOS', 'UNIT', 'FREQ', 'DUR', 'WHO']
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        precision = metrics.flat_precision_score(y_test, y_prediction,
                                                 labels=sorted_labels,
                                                 average='micro')
        recall = metrics.flat_recall_score(y_test, y_prediction,
                                           labels=sorted_labels,
                                           average='micro')
        f1 = metrics.flat_f1_score(y_test, y_prediction,
                                   labels=sorted_labels, average='micro')
        print('MICRO')
        print(precision, recall, f1)
        precision = metrics.flat_precision_score(y_test, y_prediction,
                                                 labels=sorted_labels,
                                                 average='macro')
        recall = metrics.flat_recall_score(y_test, y_prediction,
                                           labels=sorted_labels,
                                           average='macro')
        f1 = metrics.flat_f1_score(y_test, y_prediction,
                                   labels=sorted_labels, average='macro')
        print('MACRO')
        print(precision, recall, f1)
        print(metrics.flat_classification_report(y_test, y_prediction,
                                                 labels=sorted_labels,
                                                 digits=3))

    def gen_test_train(self, corpus_file):
        """
        Given the corpus file, generates the train and test sets with a
        90%/10% split.
        """
        sentences = self.__load_corpus__(corpus_file)
        test_number = int(len(sentences) * 0.1)
        test = random.sample(sentences, test_number)
        train = [sent for sent in sentences if sent not in test]
        x_train = [self.model.sentence2features(s) for s in train]
        y_train = [self.model.sentence2labels(s) for s in train]
        x_test = [self.model.sentence2features(s) for s in test]
        y_test = [self.model.sentence2labels(s) for s in test]
        return x_train, y_train, x_test, y_test

    @staticmethod
    def __write_tsv__(sentences, name):
        with open(name, 'w') as file:
            for sent in sentences:
                for line in sent:
                    file.write(line[0] + '\t' + line[1] + '\t' + line[2] + '\n')
                file.write('\n')

    @staticmethod
    def __load_corpus__(corpus_file):
        # Sentences are separated by blank lines in the TSV corpus.
        with open(corpus_file, newline='', encoding='utf-8') as file:
            data = list(csv.reader(file, delimiter='\t'))
        sentences = []
        sent = []
        for line in data:
            if line == [] or line[0] == '':
                sentences.append(sent)
                sent = []
            else:
                sent.append(line)
        sentences.append(sent)
        return sentences
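# Usage sketch (not part of the original source): trains a CRF from a labeled
# TSV corpus and prints the classification report returned by generate_model().
# 'corpus.tsv' is a placeholder path; the fitted model is written to model.pkl.
def _demo_training():
    trainer = Trainer()
    report = trainer.generate_model('corpus.tsv')
    print(report)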
class I2B2Extractor:

    def __init__(self):
        self.model = Model('model.pkl')
        self.path = '../i2b2_data/data/'
        # maps this project's entity tags to the i2b2 annotation identifiers
        self.identifiers = {'DOS': 'do', 'UNIT': 'do', 'FREQ': 'f', 'PER': 'du'}
        label_files = os.listdir(self.path + 'annotations_ground_truth/pool/')
        self.label_file_dict = {name.split('.')[0]: name for name in label_files}

    def parse_file(self, file):
        with open(file) as f:
            lines = f.readlines()
        for i, line in enumerate(lines):
            self.label_line(line, i + 1)

    def extract_all(self):
        training_set_folders = [
            self.path + 'training.sets.released/' + str(ind)
            for ind in range(1, 11)
        ]
        for training_set in training_set_folders:
            for file in os.listdir(training_set):
                if file in self.label_file_dict.keys():
                    self.parse_file(training_set + '/' + file)

    def write_output(self):
        pass

    def label_line(self, line, index):
        prediction = self.model.predict(line)
        if len(prediction) == 0:
            return
        prediction = prediction[0]
        print(prediction)
        entities = {
            'm': ['nm'],
            'do': [],
            'mo': ['nm'],
            'f': [],
            'du': [],
            'r': ['nm'],
            'ln': ['nm']
        }
        for i, (word, entity) in enumerate(prediction):
            if entity not in self.identifiers.keys():
                continue
            entities[self.identifiers[entity]].append(
                (word, str(index) + ':' + str(i)))
        print(entities.items())
        print()

    def generate_labeled_file(self, file):
        with open(self.path + 'train.test.released.8.17.09/' + file) as f:
            lines = f.readlines()
        words1 = []
        locations = []
        for i, line in enumerate(lines):
            words = line.split(' ')
            words = [w for w in words if w not in ['\n', '']]
            words = [w.replace('\n', '') if w.endswith('\n') else w
                     for w in words]
            locations += [(i + 1, n) for n in range(len(words))]
            words1 += words
        text = ' '.join(words1)
        sentences = nltk.sent_tokenize(text)
        # Re-align the sentence-tokenized words with the original line-by-line
        # positions; sent_tokenize can split or merge tokens at boundaries.
        k = 0
        new_sentences = []
        for i, sent in enumerate(sentences):
            new_sent = sent.split(' ')
            j = 0
            while j < len(new_sent):
                word = new_sent[j]
                if not word == words1[k]:
                    if j == 0 and words1[k - 1].endswith(new_sent[0]):
                        del new_sent[0]
                        j -= 1
                        k -= 1
                    if words1[k].startswith(word):
                        if len(new_sent) > j + 1:
                            if words1[k].endswith(new_sent[j + 1]):
                                new_sent[j] = words1[k]
                                del new_sent[j + 1]
                                k -= 1
                        elif words1[k].endswith(sentences[i + 1].split(' ')[0]):
                            new_sent[j] = words1[k]
                        elif (words1[k] + words1[k + 1]).endswith(
                                sentences[i + 1].split(' ')[0]):
                            new_sent[j] = words1[k]
                k += 1
                j += 1
            tmp = []
            for word in new_sent:
                if '\t' in word:
                    word = word.replace('\t', ' ')
                tmp.append(word)
            new_sentences.append(tmp)
        words2 = []
        for sent in new_sentences:
            words2 += sent
        k = 0
        labeled_sentences = []
        for i, sent in enumerate(new_sentences):
            pos = nltk.pos_tag(sent)
            loc = locations[k:k + len(pos)]
            k += len(pos)
            labeled_sentences.append(list(zip(pos, loc)))
        labels = self.get_labels(file)
        with open('i2b2_corpus/' + file + '.tsv', 'w') as f:
            for sent in labeled_sentences:
                for tup in sent:
                    if tup[1] in labels.keys():
                        f.write(tup[0][0] + '\t' + tup[0][1] + '\t' +
                                labels[tup[1]] + '\n')
                    else:
                        f.write(tup[0][0] + '\t' + tup[0][1] + '\t' + 'O\n')
                f.write('\n')

    def get_labels(self, file):
        label_file = self.label_file_dict[file]
        with open(self.path +
                  'annotations_ground_truth/converted.noduplicates.sorted/' +
                  label_file) as f:
            lines = f.readlines()
        labels = []
        for line in lines:
            labels += line.split('||')
        result = {}
        for label in labels:
            entity = label.split('=')[0]
            if len(label.split(' ')) < 3:
                continue
            if ',' in label and '...' in label:
                # discontinuous annotation: several "line:word" ranges
                positions = label.split('" ')[1].split(',')
                for pos in positions:
                    position = pos.split(' ')[-2:]
                    line_no = int(position[0].split(':')[0])
                    start = int(position[0].split(':')[1])
                    stop = int(position[1].split(':')[1])
                    for i in range(start, stop + 1):
                        result.update({(line_no, i): entity})
            else:
                position = label.split(' ')[-2:]
                line_no = int(position[0].split(':')[0])
                start = int(position[0].split(':')[1])
                stop = int(position[1].split(':')[1])
                for i in range(start, stop + 1):
                    result.update({(line_no, i): entity})
        return result

    def generate_corpus_files(self):
        training_set = self.path + 'train.test.released.8.17.09/'
        for file in os.listdir(training_set):
            if file in self.label_file_dict.keys():
                self.generate_labeled_file(file)

    def concatenate_corpus_files(self):
        files = os.listdir('i2b2_corpus/')
        print(files)
        lines = []
        for file in files:
            with open('i2b2_corpus/' + file) as f:
                lines.append(f.readlines())
        with open('corpus_i2b2.tsv', 'w') as f:
            for text in lines:
                f.writelines(text)
                f.write('\n')
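# Usage sketch (not part of the original source): the end-to-end corpus build,
# writing one TSV per annotated i2b2 file and then merging them into
# corpus_i2b2.tsv. Assumes the i2b2 release is unpacked under ../i2b2_data/.
def _build_i2b2_corpus():
    extractor = I2B2Extractor()
    extractor.generate_corpus_files()
    extractor.concatenate_corpus_files()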
def chunk_sentence(sentence):
    """Labels a sentence and chunks the entities with a regexp grammar,
    merging dosage ranges joined by 'or', 'to', '-' or '/'."""
    model = Model('../model.pkl')
    sentence = fix_dashes_slashes(sentence)
    ne_labels = model.predict(sentence)
    sent = nltk.word_tokenize(sentence)
    pos = nltk.pos_tag(sent)
    sent = list(zip([x[0] for x in pos], [n[1] for n in ne_labels]))
    sent = convert_to_IOB(sent)
    # mark the connector tokens so the grammar can match them explicitly
    for i, t in enumerate(sent):
        if t[0] == 'or':
            sent[i] = ('or', t[1] + '_or')
        if t[0] == 'to':
            sent[i] = ('to', t[1] + '_to')
        if t[0] == '-':
            sent[i] = ('-', t[1] + '_-')
        if t[0] == '/':
            sent[i] = ('/', t[1] + '_/')
    grammar = r"""
        DOS: {<DOS.*>+}
        UNIT: {<UNIT.*>+}
        FREQ: {<FREQ.*>+}
        PER: {<PER.*>+}
        WHO: {<WHO.*>+}
        O: {<O>+}
        O_or: {<O_or>}
        O_to: {<O_to>}
        O_-: {<O_->}
        O_/: {<O_/>}
        DOSAGE: {<DOS><UNIT>?<O_.*><DOS>?<UNIT>}
        DOSAGE: {<DOS><UNIT>}
        O: {<DOS>}
        O: {<UNIT>}
    """
    cp = nltk.RegexpParser(grammar)
    print(sent)
    result = cp.parse(sent)
    # restore the plain labels on the connector chunks
    for st in result.subtrees(lambda t: '_' in t.label()):
        st.set_label(st.label().split('_')[0])
    for leafPos in result.treepositions('leaves'):
        result[leafPos] = result[leafPos][0]
    # merge consecutive O chunks into a single subtree
    res = nltk.Tree('S', [])
    i = 0
    while i < len(result):
        t = result[i]
        if t.label() == 'O':
            leaves = t.leaves()
            while t.label() == 'O':
                i += 1
                try:
                    t = result[i]
                except IndexError:
                    break
                if t.label() == 'O':
                    leaves += t.leaves()
            res.append(nltk.Tree('O', leaves))
            continue
        res.append(t)
        i += 1
    # res.draw()
    return res
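# Usage sketch (not part of the original source): chunk_sentence() returns an
# nltk.Tree; the DOSAGE rules are intended to merge ranges such as "2 to 4 mg".
# The sentence is illustrative only and assumes a trained ../model.pkl.
def _demo_chunking():
    tree = chunk_sentence('Take 2 to 4 mg every morning')
    tree.pprint()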