def __init__(self, K, corpus: Corpus, out='.'):
    """
    Arguments:
        K: number of topics.
        corpus: a collection of documents, each represented as a class
            instance. Because document lengths differ, the corpus is not
            stored as a D*M matrix.
    """
    self.out = out  # folder to save experiments
    self.C = corpus.C
    self.generator = Corpus.generator_full_batch(corpus)
    self.D = corpus.D
    self.K = K
    self.T = corpus.T  # number of distinct data types
    self.W = corpus.W  # number of distinct words
    self.alpha = np.random.gamma(1, 10, self.K)  # hyperparameter for prior on weight vectors theta
    self.iota = np.random.gamma(1 + 0.001, 0.001, self.T)  # hyperparameter for prior on type vectors beta
    self.zeta = np.random.gamma(2, 100, (self.W, self.K))  # hyperparameter for prior on word vectors eta
    self.alpha_sum = np.sum(self.alpha)  # scalar
    self.iota_sum = np.sum(self.iota)  # scalar
    self.zeta_sum = np.sum(self.zeta, axis=0)  # sum over w; K-dimensional

    self.tau = np.random.gamma(2, 0.5, self.K)

    # variational parameters
    self.lambda_ = np.zeros(self.D)
    self.m_ = np.ones(self.K)
    self.s = np.ones(self.K)
    self.exp_g = np.random.normal(size=self.D)
    self.exp_z_avg = np.zeros((self.D, self.K))
    self.exp_q_z = 0

    # token variables (expected counts), normalized over the topic dimension
    self.exp_n = np.random.rand(self.D, self.K)
    self.exp_m = np.random.rand(self.T, self.K)
    self.exp_p = np.random.rand(self.T, self.W, self.K)
    for d in range(self.D):
        self.exp_n[d] /= np.sum(self.exp_n[d])
    for t in range(self.T):
        self.exp_m[t] /= np.sum(self.exp_m[t])
    for t in range(self.T):
        for w in range(self.W):
            self.exp_p[t, w] /= np.sum(self.exp_p[t, w])

    self.exp_n_sum = np.sum(self.exp_n, axis=1)  # sum over k; exp_n is [D, K]
    self.exp_m_sum = np.sum(self.exp_m, axis=0)  # sum over t; exp_m is [T, K]
    self.exp_p_sum = np.sum(self.exp_p, axis=1)  # sum over w; exp_p is [T, W, K]

    self.lasso = LogisticRegression()

    # model parameters to persist
    self.parameters = ['alpha', 'iota', 'zeta', 'gamma', 'm_', 's', 'W', 'T']
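# Aside (not part of the original code): the per-row normalization loops in
# __init__ can be collapsed into broadcasted divisions over the topic axis.
# A minimal sketch, assuming only NumPy; shapes are illustrative:
#
#     import numpy as np
#     exp_p = np.random.rand(3, 5, 4)             # [T, W, K]
#     exp_p /= exp_p.sum(axis=-1, keepdims=True)  # normalize each [t, w] row over K
#     assert np.allclose(exp_p.sum(axis=-1), 1.0)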
def predict(self, corpus, max_iter=100):
    self.D = corpus.D
    self.C = corpus.C
    self.generator = Corpus.generator_full_batch(corpus)
    self.batchsize = self.D  # perform a full batch

    # re-initialize expected token counts, normalized over the topic dimension
    self.exp_n = np.random.rand(self.D, self.K)
    self.exp_m = np.random.rand(self.T, self.K)
    self.exp_p = np.random.rand(self.T, self.W, self.K)
    for d in range(self.D):
        self.exp_n[d] /= np.sum(self.exp_n[d])
    for t in range(self.T):
        self.exp_m[t] /= np.sum(self.exp_m[t])
    for t in range(self.T):
        for w in range(self.W):
            self.exp_p[t, w] /= np.sum(self.exp_p[t, w])
    self.exp_n_sum = np.sum(self.exp_n, axis=1)  # sum over k; exp_n is [D, K]
    self.exp_m_sum = np.sum(self.exp_m, axis=0)  # sum over t; exp_m is [T, K]
    self.exp_p_sum = np.sum(self.exp_p, axis=1)  # sum over w; exp_p is [T, W, K]

    self.exp_z_avg = np.zeros((self.D, self.K))
    self.exp_q_z = 0
    self.exp_g = np.ones(self.D)
    self.lambda_ = np.zeros(self.D)

    elbo = [100, 0]
    iter = 0
    for i, d in enumerate(self.generator):
        batch_patient, batch_i, M = d
        # init gamma uniformly at random and hide the test labels
        self.gamma = {pat.patient_id: np.random.rand(len(pat.words_dict), self.K)
                      for pat in batch_patient}
        for pat in batch_patient:
            pat.y = -1
            pat.isMissingLabel = True
        while iter < max_iter:
            self.CVB0_test(batch_patient, iter)
            pred_result = self.lasso.predict_proba(self.exp_n)[:, 1]
            avg_pr = average_precision_score(self.y_test, pred_result)
            fpr, tpr, threshold = roc_curve(self.y_test, pred_result, pos_label=1)
            roc_auc_rf = auc(fpr, tpr)
            pickle.dump([self.y_test, pred_result],
                        open('prediction_y_p_test.pkl', 'wb'))
            if (iter + 1) % 50 == 0:
                self.save_model(iter + 1)
            iter += 1
        if not iter < max_iter:
            break  # stop consuming batches once max_iter is reached
def predict(self, corpus, max_iter=500):
    self.D = corpus.D
    self.C = corpus.C
    self.generator = Corpus.generator_full_batch(corpus)

    self.lambda_ = np.zeros(self.D)
    self.exp_z_avg = np.zeros((self.D, self.K))
    self.exp_q_z = 0
    self.exp_n = np.random.rand(self.D, self.K)
    for d in range(self.D):
        self.exp_n[d] /= np.sum(self.exp_n[d])
    self.exp_n_sum = np.sum(self.exp_n, axis=1)  # sum over k; exp_n is [D, K]

    iter = 1
    for i, d in enumerate(self.generator):
        batch_patient, batch_i, M = d
        # init gamma uniformly at random and hide the test labels
        self.gamma = {pat.patient_id: np.random.rand(len(pat.words_dict), self.K)
                      for pat in batch_patient}
        for pat in batch_patient:
            pat.y = -1
            pat.isMissingLabel = True
        while iter <= max_iter:
            self.CVB0_test(batch_patient, iter)
            # probit predictive from the weight posterior w ~ N(m_, diag(s))
            num = self.exp_z_avg.dot(self.m_)
            denom = np.array([1 + np.sqrt(np.dot(z_avg.dot(np.diag(self.s)), z_avg))
                              for z_avg in self.exp_z_avg])
            p = norm.cdf(num / denom)
            y = np.random.binomial(1, p).flatten()  # sampled hard labels (unused)
            avg_pr = average_precision_score(self.y_test, p)
            fpr, tpr, threshold = roc_curve(self.y_test, p, pos_label=1)
            roc_auc_rf = auc(fpr, tpr)
            print("it-%d: AUC %.2f - APRC %.2f" % (iter, roc_auc_rf, avg_pr))
            iter += 1

    # save prediction
    pickle.dump((self.y_test, p),
                open(os.path.join(self.out, 'prediction_y_p_%d.pkl' % self.K), 'wb'))
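# Aside (reference, not the original code): for weights w ~ N(m_, diag(s)) and a
# probit link, the textbook marginal predictive is
#     P(y = 1 | z_avg) = Phi( m_ . z_avg / sqrt(1 + z_avg^T diag(s) z_avg) ),
# whereas predict() above uses 1 + sqrt(z^T diag(s) z) as the denominator.
# A minimal sketch of the textbook form, assuming only NumPy/SciPy:
#
#     import numpy as np
#     from scipy.stats import norm
#
#     def probit_predictive(z_avg, m, s):
#         # z_avg: [K] topic average; m: [K] weight means; s: [K] marginal variances
#         return norm.cdf(z_avg @ m / np.sqrt(1.0 + z_avg @ (s * z_avg)))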
def infer(self, corpus: Corpus, infer_only=False, predict=False, max_iter=500, tol=1e-4):
    elbo = [100, 0]
    iter = 0
    diff = 1

    # init containers
    self.C = corpus.C
    self.D = corpus.D
    self.init_variational_params()
    self.init_expectations(infer_only)

    # sample a full batch of the corpus
    generator = Corpus.generator_full_batch(corpus)

    # init gamma uniformly at random
    for i, d in enumerate(generator):
        batch_patient, batch_i, M = d
        self.gamma = {pat.patient_id: np.random.rand(len(pat.words_dict), self.K)
                      for pat in batch_patient}

    # NOTE: if generator_full_batch returns a one-shot generator, it is exhausted
    # by the initialization pass above and must be re-created for the loop below.
    while iter < max_iter and diff > tol:
        for i, d in enumerate(generator):
            batch_patient, batch_index, M = d
            old_gamma = self.gamma.copy()  # shallow snapshot; assumes CVB0 rebinds the arrays

            # infer topics
            self.CVB0(batch_patient, infer_only)

            # test convergence: mean absolute change of gamma between sweeps
            # elbo.append(self.ELBO())
            iter += 1
            diff = np.mean([np.mean(np.abs(old_gamma[pid] - self.gamma[pid]))
                            for pid in self.gamma])
            print("it %d. diff: %.5f " % (iter, diff))

            # predict
            if predict:
                self.predict(corpus.labels)
            if (iter + 1) % 100 == 0:
                self.save_model(iter + 1)
            if not (iter < max_iter and diff > tol):
                break

    pickle.dump(elbo, open(os.path.join(self.out, 'elbo_training.pkl'), 'wb'))
    pickle.dump(self.gamma, open(os.path.join(self.out, 'gamma_train.pkl'), 'wb'))
    return self.gamma
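# Aside (assumption): dict.copy() is shallow, so old_gamma and self.gamma can
# alias the same arrays if CVB0 updates them in place, making diff identically
# zero. A minimal sketch of an alias-safe convergence check:
#
#     import copy
#     import numpy as np
#
#     def gamma_diff(gamma_old, gamma_new):
#         # both: dict patient_id -> [n_words, K] array; mean absolute change
#         return np.mean([np.mean(np.abs(gamma_old[k] - gamma_new[k]))
#                         for k in gamma_new])
#
#     # old = copy.deepcopy(self.gamma); self.CVB0(batch, ...); gamma_diff(old, self.gamma)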
    with h5py.File(model_path, 'r') as hf:
        for param in self.parameters:
            if param == 'gamma':
                pass  # gamma is loaded separately, e.g. self.gamma = dd.io.load(gamma_file)
            else:
                self.__setattr__(param, hf[param][...])


if __name__ == '__main__':
    train_dir = "/Users/cuent/Downloads/processed_new/mv/out/cv1/train"
    test_dir = "/Users/cuent/Downloads/processed_new/mv/out/cv1/test"
    # train_dir = "/Users/cuent/Downloads/processed_new/single"
    # test_dir = "/Users/cuent/Downloads/processed_new/single"
    c_train = Corpus.read_corpus_from_directory(train_dir)
    c_test = Corpus.read_corpus_from_directory(test_dir)
    y_train_true = np.array([p[0].y for p in c_train])
    y_test_true = np.array([p[0].y for p in c_test])

    K = 50
    mixehr = MixEHR(K, c_train)
    mixehr.y_train = y_train_true
    mixehr.y_test = y_test_true
    mixehr.inference_svb(max_iter=500, save_every=100)
    mixehr.load_model("model_smixehr_k50_it500.hdf5")
    mixehr.predict(c_test, max_iter=300)
                self.gamma = pickle.load(
                    open(os.path.join(self.out, 'gamma%d.pkl' % iter), 'rb'))
                self.exp_n = pickle.load(
                    open(os.path.join(self.out, 'exp_n_%d.pkl' % iter), 'rb'))
                self.exp_m = pickle.load(
                    open(os.path.join(self.out, 'exp_m_%d.pkl' % iter), 'rb'))
                self.exp_p = pickle.load(
                    open(os.path.join(self.out, 'exp_p_%d.pkl' % iter), 'rb'))
            else:
                self.__setattr__(param, hf[param][...])
    # refit the downstream classifier on the loaded topic loadings
    self.lasso.fit(self.exp_n, self.y_train)


if __name__ == '__main__':
    # c_train = Corpus.read_corpus_from_directory("../split/train")
    # c_test = Corpus.read_corpus_from_directory("../split/test")
    c_train = Corpus.read_corpus_from_directory("../dataset/cv1/train")
    c_test = Corpus.read_corpus_from_directory("../dataset/cv1/test")
    y_train_true = np.array([p[0].y for p in c_train])
    y_test_true = np.array([p[0].y for p in c_test])

    K = 100
    mixehr = MixEHR(K, c_train)
    mixehr.y_train = y_train_true
    mixehr.y_test = y_test_true
    mixehr.load_model("model_mixehr_100_100.hdf5")
    mixehr.predict(c_test)
    'word_ngrams': (1, 2),
    'word_topk': 100,
    'pos_ngrams': (1, 2),
    'word_lemma': True,
    'word_entities': False,
    'word_punct': False,
    'pos_detailed': False,
    'char_punct': False,
    'char_lower': False,
    'coref_n': 2,
    'coref_pos_types': ['DT', 'NN', 'NNP', 'NNPS', 'NNS', 'PRP', 'PRP$'],
    'coref_dependencies': ['dobj', 'nsubj', 'nsubjpass', 'pobj', 'poss'],
    'coref_group': True,
}

corpus = Corpus()
for i in range(N):
    print('Loading document {} of {}'.format(i, N))
    doc = Document(text=sample_data.body.iloc[i],
                   author=sample_data.author.iloc[i],
                   category=sample_data.primary_tags.iloc[i],
                   spacy_model=nlp)
    corpus.documents.append(doc)

corpus.init_docs(**corpus_params)
corpus.build_data()
corpus.save('data/full_corpus_100.pkl')
def extract_docs(corpus_dir, vocab, out_dir):
    corpus, meta = Corpus.read_corpus_from_directory(corpus_dir, True)
    type_ids, vocab_ids = meta

    # build reverse lookups: internal id -> original id
    type_ids_rev = {}
    vocab_ids_rev = {}
    for k in type_ids:
        type_ids_rev[type_ids[k]] = k
    for k in vocab_ids:
        vocab_ids_rev[vocab_ids[k]] = k

    ignored_words = 0
    docs_only = []
    docs_freq = []
    patients = []
    ids = []
    responses = []

    pbar = tqdm(corpus)
    for c, _ in pbar:
        patient_id = c.index
        i = c.patient_id
        label = c.y
        words = c.words_dict
        flat_words = []
        flat_words_freq = []
        for (type_id, word_id), freq in words.items():
            type_id = type_ids_rev[type_id]
            word_id = vocab_ids_rev[word_id]
            flat_words_freq.append("%d:%d" % (word_id, freq))
            vocab_type = vocab[type_id][['pheId', 'pheName']]
            w = vocab_type.loc[vocab_type['pheId'] == word_id]['pheName'].tolist()
            if len(w) > 1:
                print(w)
            w = freq * w  # repeat each word `freq` times
            if len(w) > 0:
                flat_words.extend(w)
            else:
                print(type_id, word_id)
                ignored_words += 1
        docs_only.append(' '.join(flat_words))
        docs_freq.append("%d %s" % (len(flat_words_freq), ' '.join(flat_words_freq)))
        ids.append(i)
        patients.append(patient_id)
        responses.append(label)

    if ignored_words > 0:
        print("Couldn't find %d words." % ignored_words)

    data = {'mixehr_id': ids, 'patient_id': patients, 'label': responses, 'text': docs_only}

    # save data
    mixehr_data = pd.DataFrame(data)
    mixehr_data.to_csv(os.path.join(out_dir, 'mix_raw.csv'), index=False)
    # save labels only
    mixehr_data[['label']].to_csv(os.path.join(out_dir, 'mix_label.csv'), index=False, header=False)
    # save sLDA format
    pd.DataFrame({'text': docs_freq}).to_csv(os.path.join(out_dir, 'mix_word_freq.csv'), index=False, header=False)
    # save vocabulary
    pickle.dump(vocab, open(os.path.join(out_dir, 'vocab.pkl'), 'wb'))
    # save id lookups
    pickle.dump((type_ids, vocab_ids), open(os.path.join(out_dir, 'id_mixehr_seq.pkl'), 'wb'))
    pickle.dump((type_ids_rev, vocab_ids_rev), open(os.path.join(out_dir, 'id_seq_mixehr.pkl'), 'wb'))
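# Aside: each line of mix_word_freq.csv follows the sLDA-style format
# "<n_unique_words> <word_id>:<freq> ...". An illustrative (made-up) line:
#
#     3 12:2 47:1 103:5    # three distinct words: id 12 twice, 47 once, 103 five times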
        for param in self.parameters:
            self.__setattr__(param, hf[param][...])


def train_cv():
    kf = StratifiedKFold(5, shuffle=True, random_state=42)
    folder = '/Users/cuent/Downloads/processed_new/delete1'
    corpus = Corpus.read_corpus_from_directory(folder + "/train")
    for train_index, test_index in kf.split(corpus, corpus.labels):
        # mark fold membership on each document
        for i in train_index:
            corpus.dataset[i].train = True
        for i in test_index:
            corpus.dataset[i].train = False


if __name__ == '__main__':
    folder = '/Users/cuent/Downloads/processed_new/delete1'
    c_train = Corpus.read_corpus_from_directory(folder + "/train")
    # c_test = Corpus.read_corpus_from_directory(folder + "/test")

    K = 21
    mixehr = MixEHR(K, c_train.T, c_train.W)
    gamma = mixehr.infer(c_train, predict=True)
    # train_cv()
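# Aside (assumption): train_cv() above only marks fold membership and never
# trains. A sketch of one plausible completion, fitting a model per fold with
# the infer() API; the MixEHR constructor call mirrors the one in __main__:
#
#     for fold, (train_index, test_index) in enumerate(kf.split(corpus, corpus.labels)):
#         for i in train_index:
#             corpus.dataset[i].train = True
#         for i in test_index:
#             corpus.dataset[i].train = False
#         model = MixEHR(K, corpus.T, corpus.W)
#         model.infer(corpus, predict=True)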