def load_raw(self):
  """Load examples and labels as strings for sklearn training"""

  labels = []    # int labels
  examples = []  # examples as strings
  no_labels = [] # docs with no labels

  # document id -> label mapping
  doc2label = i2b2.parse_standoff(
    self.annot_xml,
    self.disease,
    self.judgement)

  for f in os.listdir(self.corpus_path):
    doc_id = f.split('.')[0]
    file_path = os.path.join(self.corpus_path, f)
    file_feat_list = utils.read_cuis(file_path)

    # some documents have no labels in the annotation file
    if doc_id in doc2label:
      string_label = doc2label[doc_id]
      int_label = LABEL2INT[string_label]
      labels.append(int_label)
      examples.append(' '.join(file_feat_list))
    else:
      no_labels.append(doc_id)

  print('%d documents with no labels for %s/%s in %s' %
        (len(no_labels), self.disease, self.judgement,
         self.annot_xml.split('/')[-1]))

  return examples, labels
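
# Hedged usage sketch, not part of the original module: one way the
# load_raw() output might feed an sklearn bag-of-words pipeline.
# 'provider' stands in for an instance of this class and is hypothetical.
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   from sklearn.linear_model import LogisticRegression
#
#   examples, labels = provider.load_raw()
#   x = CountVectorizer().fit_transform(examples)
#   classifier = LogisticRegression().fit(x, labels)
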
def make_token_alphabet(self):
  """Map tokens (CUIs) to integers"""

  # count token frequencies in the entire corpus
  token_counts = collections.Counter()
  for f in os.listdir(self.corpus_path):
    file_path = os.path.join(self.corpus_path, f)
    if self.use_cuis:
      file_feat_list = utils.read_cuis(file_path)
    else:
      file_feat_list = utils.read_tokens(file_path)
    token_counts.update(file_feat_list)

  # now make the alphabet (high frequency tokens first);
  # index 0 is reserved for out-of-vocabulary tokens
  index = 1
  self.token2int['oov_word'] = 0
  with open(ALPHABET_FILE, 'w') as outfile:
    for token, count in token_counts.most_common():
      if count > self.min_token_freq:
        outfile.write('%s|%s\n' % (token, count))
        self.token2int[token] = index
        index += 1

  # pickle the alphabet for reuse at prediction time
  with open(self.alphabet_pickle, 'wb') as pickle_file:
    pickle.dump(self.token2int, pickle_file)
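
# Hedged sketch, not in the original module: restoring the pickled alphabet
# so a held-out set can be vectorized with the same token-to-integer mapping;
# 'provider' is a hypothetical instance of this class.
#
#   import pickle
#
#   with open(provider.alphabet_pickle, 'rb') as pickle_file:
#     token2int = pickle.load(pickle_file)
#   oov_index = token2int['oov_word']  # 0 by construction above
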
def __init__(self, corpus_path, max_tokens_in_file):
  """Load documents as strings"""

  self.samples = []
  for f in os.listdir(corpus_path):
    file_path = os.path.join(corpus_path, f)
    file_feat_list = utils.read_cuis(file_path)
    # skip documents that exceed the length cutoff
    if len(file_feat_list) < max_tokens_in_file:
      self.samples.append(' '.join(file_feat_list))
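
# Usage sketch under a loud assumption: that these space-joined CUI strings
# are destined for word embedding training. gensim and the class name
# 'EmbeddingCorpus' are hypothetical here; the 'size' argument follows the
# older gensim API.
#
#   from gensim.models import Word2Vec
#
#   corpus = EmbeddingCorpus('path/to/corpus', max_tokens_in_file=10000)
#   sentences = [sample.split() for sample in corpus.samples]
#   model = Word2Vec(sentences, size=300, min_count=5, workers=4)
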
def load(self, maxlen=float('inf'), tokens_as_set=True):
  """Convert examples into lists of indices for keras"""

  labels = []    # int labels
  examples = []  # examples as int sequences
  no_labels = [] # docs with no labels

  # document id -> label mapping
  doc2label = i2b2.parse_standoff(
    self.annot_xml,
    self.disease,
    self.judgement)

  # load examples and labels
  for f in os.listdir(self.corpus_path):
    doc_id = f.split('.')[0]
    file_path = os.path.join(self.corpus_path, f)
    if self.use_cuis:
      file_feat_list = utils.read_cuis(file_path)
    else:
      file_feat_list = utils.read_tokens(file_path)

    # map tokens to integers, falling back to the OOV index
    example = []
    if tokens_as_set:
      file_feat_list = set(file_feat_list)
    for token in file_feat_list:
      if token in self.token2int:
        example.append(self.token2int[token])
      else:
        example.append(self.token2int['oov_word'])

    # truncate examples that are too long
    if len(example) > maxlen:
      example = example[:maxlen]

    # some documents have no labels in the annotation file
    if doc_id in doc2label:
      string_label = doc2label[doc_id]
      int_label = LABEL2INT[string_label]
      labels.append(int_label)
      examples.append(example)
    else:
      no_labels.append(doc_id)

  print('%d documents with no labels for %s/%s in %s' %
        (len(no_labels), self.disease, self.judgement,
         self.annot_xml.split('/')[-1]))

  return examples, labels
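
# Hedged sketch, not part of the original module: consuming load() output in
# keras. pad_sequences/to_categorical follow the keras 2 API of this code's
# era; 'provider' is a hypothetical instance of this class.
#
#   import numpy as np
#   from keras.preprocessing.sequence import pad_sequences
#   from keras.utils import to_categorical
#
#   examples, labels = provider.load(maxlen=5000)
#   x = pad_sequences(examples, maxlen=5000)
#   y = to_categorical(np.array(labels))
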
def load_vectorized(self, exclude, maxlen=float('inf')):
  """Same as load() above, but labels are vectors"""

  labels = []    # label vectors
  examples = []  # examples as int sequences
  no_labels = [] # docs with no labels

  # document id -> vector of labels
  doc2labels = i2b2.parse_standoff_vectorized(
    self.annot_xml,
    self.judgement,
    exclude)

  # load examples and labels
  for f in os.listdir(self.corpus_path):
    doc_id = f.split('.')[0]
    file_path = os.path.join(self.corpus_path, f)
    file_feat_list = utils.read_cuis(file_path)

    # TODO: use unique tokens or not?
    example = []
    for token in set(file_feat_list):
      if token in self.token2int:
        example.append(self.token2int[token])
      else:
        example.append(self.token2int['oov_word'])

    # truncate examples that are too long
    if len(example) > maxlen:
      example = example[:maxlen]

    # some documents have no labels in the annotation file
    if doc_id in doc2labels:
      label_vector = doc2labels[doc_id]
      labels.append(label_vector)
      examples.append(example)
    else:
      no_labels.append(doc_id)

  print('%d documents with no labels for %s/%s in %s' %
        (len(no_labels), self.disease, self.judgement,
         self.annot_xml.split('/')[-1]))

  return examples, labels
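
# Hedged sketch for the multi-label case, not original code: the per-document
# label vectors can be stacked into a 2D target matrix, e.g. for a network
# with one sigmoid output per disease. 'provider' is hypothetical and the
# exclude argument's value here is a guess.
#
#   import numpy as np
#   from keras.preprocessing.sequence import pad_sequences
#
#   examples, labels = provider.load_vectorized(exclude=set(), maxlen=5000)
#   x = pad_sequences(examples, maxlen=5000)
#   y = np.array(labels)  # one row of labels per document
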