def make_token_alphabet(self):
  """Map tokens (CUIs) to integers"""

  # count tokens in the entire corpus
  token_counts = collections.Counter()
  for f in os.listdir(self.corpus_path):
    file_path = os.path.join(self.corpus_path, f)
    if self.use_cuis:
      file_feat_list = utils.read_cuis(file_path)
    else:
      file_feat_list = utils.read_tokens(file_path)
    token_counts.update(file_feat_list)

  # now make alphabet (high freq tokens first)
  # index 0 is reserved for out-of-vocabulary tokens
  index = 1
  self.token2int['oov_word'] = 0
  with open(ALPHABET_FILE, 'w') as outfile:
    for token, count in token_counts.most_common():
      if count > self.min_token_freq:
        outfile.write('%s|%s\n' % (token, count))
        self.token2int[token] = index
        index += 1

  # pickle alphabet
  with open(self.alphabet_pickle, 'wb') as pickle_file:
    pickle.dump(self.token2int, pickle_file)
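# Illustrative sketch, not part of the original class: a later run
# could restore a previously pickled alphabet instead of rebuilding
# it from the corpus. The method name is an assumption; only
# pickle.load and self.alphabet_pickle are taken from the code above.
def load_token_alphabet(self):
  """Restore the token -> int mapping pickled by make_token_alphabet"""
  with open(self.alphabet_pickle, 'rb') as pickle_file:
    self.token2int = pickle.load(pickle_file)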
def load_raw(self):
  """Load for sklearn training"""

  labels = []    # string labels
  examples = []  # examples as strings
  no_labels = [] # docs with no labels

  # document id -> label mapping
  doc2label = i2b2.parse_standoff(
    self.annot_xml,
    self.disease,
    self.judgement)

  for f in os.listdir(self.corpus_path):
    doc_id = f.split('.')[0]
    file_path = os.path.join(self.corpus_path, f)
    if self.use_cuis:
      file_feat_list = utils.read_cuis(file_path)
    else:
      file_feat_list = utils.read_tokens(file_path)

    # some documents have no labels in the annotation file
    if doc_id in doc2label:
      string_label = doc2label[doc_id]
      int_label = LABEL2INT[string_label]
      labels.append(int_label)
      examples.append(' '.join(file_feat_list))
    else:
      no_labels.append(doc_id)

  print('%d documents with no labels for %s/%s in %s' %
        (len(no_labels), self.disease, self.judgement,
         self.annot_xml.split('/')[-1]))

  return examples, labels
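# Usage sketch (assumed, not from the original file): load_raw()
# returns whitespace-joined token strings, which sklearn can
# vectorize directly; 'dataset' is a hypothetical instance of this
# provider class.
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   from sklearn.linear_model import LogisticRegression
#
#   examples, labels = dataset.load_raw()
#   x = CountVectorizer().fit_transform(examples)
#   classifier = LogisticRegression().fit(x, labels)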
def load(self, maxlen=float('inf'), tokens_as_set=True):
  """Convert examples into lists of indices for keras"""

  labels = []    # int labels
  examples = []  # examples as int sequences
  no_labels = [] # docs with no labels

  # document id -> label mapping
  doc2label = i2b2.parse_standoff(
    self.annot_xml,
    self.disease,
    self.judgement)

  # load examples and labels
  for f in os.listdir(self.corpus_path):
    doc_id = f.split('.')[0]
    file_path = os.path.join(self.corpus_path, f)
    if self.use_cuis:
      file_feat_list = utils.read_cuis(file_path)
    else:
      file_feat_list = utils.read_tokens(file_path)

    # map tokens to integers, falling back to the oov index
    example = []
    if tokens_as_set:
      file_feat_list = set(file_feat_list)
    for token in file_feat_list:
      if token in self.token2int:
        example.append(self.token2int[token])
      else:
        example.append(self.token2int['oov_word'])

    # truncate examples longer than maxlen
    if len(example) > maxlen:
      example = example[:maxlen]

    # some documents have no labels in the annotation file
    if doc_id in doc2label:
      string_label = doc2label[doc_id]
      int_label = LABEL2INT[string_label]
      labels.append(int_label)
      examples.append(example)
    else:
      no_labels.append(doc_id)

  print('%d documents with no labels for %s/%s in %s' %
        (len(no_labels), self.disease, self.judgement,
         self.annot_xml.split('/')[-1]))

  return examples, labels
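# Usage sketch (assumed, not from the original file): the variable
# length int sequences returned by load() are typically padded to a
# fixed length before training a keras model; 'dataset' is a
# hypothetical instance of this provider class, and the import path
# may be tensorflow.keras depending on the keras version in use.
#
#   from keras.preprocessing.sequence import pad_sequences
#   from keras.utils import to_categorical
#
#   examples, labels = dataset.load(maxlen=5000)
#   x = pad_sequences(examples, maxlen=5000)
#   y = to_categorical(labels)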
parser.add_option("-d", "--debug", action="store_true",
                  help="turn on debug mode")
(options, args) = parser.parse_args()
if len(args) != 2:
  parser.error("Please provide required arguments")

if options.debug:
  logging.basicConfig(level=logging.DEBUG)
else:
  logging.basicConfig(level=logging.CRITICAL)

training_file = args[0]
training_sents = utils.read_tokens(training_file)
test_file = args[1]
test_sents = utils.read_tokens(test_file)

model = create_model(training_sents)

## read sentences again because predict_tags(...) rewrites the tags
sents = utils.read_tokens(training_file)
predictions = predict_tags(sents, model)
accuracy = utils.calc_accuracy(training_sents, predictions)
print("Accuracy in training [%s sentences]: %s" % (len(sents), accuracy))

## read sentences again because predict_tags(...) rewrites the tags
sents = utils.read_tokens(test_file)
predictions = predict_tags(sents, model)
accuracy = utils.calc_accuracy(test_sents, predictions)
print("Accuracy in test [%s sentences]: %s" % (len(sents), accuracy))