Example #1
    def __init__(self, K, corpus: Corpus, out='.'):
        """
        Arguments:
            K: Number of topics
            corpus: A collection of documents, each represented as a class instance. Because document
                lengths differ from one another, the corpus is not stored as a D*M matrix.
        """
        self.out = out  # folder to save experiments

        self.C = corpus.C
        self.generator = Corpus.generator_full_batch(corpus)
        self.D = corpus.D
        self.K = K
        self.T = corpus.T  # number of distinct data types
        self.W = corpus.W  # number of distinct words

        self.alpha = np.random.gamma(
            1, 10, self.K)  # hyperparameter for prior on weight vectors theta
        self.iota = np.random.gamma(
            1 + 0.001, 0.001,
            self.T)  # hyperparameter for prior on type vectors beta
        self.zeta = np.random.gamma(
            2, 100,
            (self.W, self.K))  # hyperparameter for prior on word vectors eta
        self.alpha_sum = np.sum(self.alpha)  # scalar value
        self.iota_sum = np.sum(self.iota)  # scalar value
        self.zeta_sum = np.sum(self.zeta, axis=0)  # sum over w, K dimensional
        self.tau = np.random.gamma(2, 0.5, self.K)

        # variational parameters
        self.lambda_ = np.zeros(self.D)
        self.m_ = np.ones(self.K)
        self.s = np.ones(self.K)
        self.exp_g = np.random.normal(size=self.D)
        self.exp_z_avg = np.zeros((self.D, self.K))
        self.exp_q_z = 0

        # token variables
        self.exp_n = np.random.rand(self.D, self.K)
        self.exp_m = np.random.rand(self.T, self.K)
        self.exp_p = np.random.rand(self.T, self.W, self.K)
        for d in range(self.D):
            self.exp_n[d] /= np.sum(self.exp_n[d])
        for t in range(self.T):
            self.exp_m[t] /= np.sum(self.exp_m[t])
        for t in range(self.T):
            for w in range(self.W):
                self.exp_p[t, w] /= np.sum(self.exp_p[t, w])
        self.exp_n_sum = np.sum(
            self.exp_n, axis=1)  # sum over k, exp_n is [D K] dimensionality
        self.exp_m_sum = np.sum(
            self.exp_m, axis=0)  # sum over t, exp_m is [T K] dimensionality
        self.exp_p_sum = np.sum(
            self.exp_p, axis=1)  # sum over w, exp_p is [T W K] dimensionality
        self.lasso = LogisticRegression()  # supervised classifier for patient labels (a LogisticRegression, despite the attribute name)

        # Model parameters
        self.parameters = [
            'alpha', 'iota', 'zeta', 'gamma', 'm_', 's', 'W', 'T'
        ]
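
The three normalization loops in the constructor above can be collapsed with NumPy broadcasting. Below is a minimal standalone sketch of an equivalent vectorized initialization (toy sizes, illustration only; not part of the original class):

import numpy as np

D, T, W, K = 100, 3, 500, 10  # toy sizes for illustration

exp_n = np.random.rand(D, K)
exp_n /= exp_n.sum(axis=1, keepdims=True)   # same effect as the per-document loop
exp_m = np.random.rand(T, K)
exp_m /= exp_m.sum(axis=1, keepdims=True)   # same effect as the per-type loop
exp_p = np.random.rand(T, W, K)
exp_p /= exp_p.sum(axis=2, keepdims=True)   # same effect as the nested t, w loop

exp_n_sum = exp_n.sum(axis=1)   # [D]; all ones once rows are normalized
exp_m_sum = exp_m.sum(axis=0)   # [K]
exp_p_sum = exp_p.sum(axis=1)   # [T, K]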
Example #2
def train_cv():
    kf = StratifiedKFold(5, shuffle=True, random_state=42)
    folder = '/Users/cuent/Downloads/processed_new/delete1'
    corpus = Corpus.read_corpus_from_directory(folder + "/train")
    for train_index, test_index in kf.split(corpus, corpus.labels):
        for i in train_index:
            corpus.dataset[i].train = True
        for i in test_index:
            corpus.dataset[i].train = False
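
train_cv above only marks fold membership via the .train flag; the snippet does not show how the flags are consumed. A hypothetical tail for the fold loop (the names train_fold and valid_fold are illustrative, not from the original) might materialize the two partitions:

        # hypothetical: collect the partitions marked by the .train flag above
        train_fold = [corpus.dataset[i] for i in train_index]
        valid_fold = [corpus.dataset[i] for i in test_index]
        print("fold: %d training / %d validation patients" % (len(train_fold), len(valid_fold)))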
Example #3
    def predict(self, corpus, max_iter=100):
        self.D = corpus.D
        self.C = corpus.C
        self.generator = Corpus.generator_full_batch(corpus)
        self.batchsize = self.D  # perform a full batch
        self.exp_n = np.random.rand(self.D, self.K)
        self.exp_m = np.random.rand(self.T, self.K)
        self.exp_p = np.random.rand(self.T, self.W, self.K)
        for d in range(self.D):
            self.exp_n[d] /= np.sum(self.exp_n[d])
        for t in range(self.T):
            self.exp_m[t] /= np.sum(self.exp_m[t])
        for t in range(self.T):
            for w in range(self.W):
                self.exp_p[t, w] /= np.sum(self.exp_p[t, w])
        self.exp_n_sum = np.sum(
            self.exp_n, axis=1)  # sum over k, exp_n is [D K] dimensionality
        self.exp_m_sum = np.sum(
            self.exp_m, axis=0)  # sum over t, exp_m is [T K] dimensionality
        self.exp_p_sum = np.sum(
            self.exp_p, axis=1)  # sum over w, exp_p is [T W K] dimensionality
        self.exp_z_avg = np.zeros((self.D, self.K))
        self.exp_q_z = 0
        self.exp_g = np.ones(self.D)
        self.lambda_ = np.zeros(self.D)

        elbo = [100, 0]
        iter = 0
        for i, d in enumerate(self.generator):
            batch_patient, batch_i, M = d
            self.gamma = {
                pat.patient_id: np.random.rand(len(pat.words_dict), self.K)
                for pat in batch_patient
            }
            for pat in batch_patient:
                pat.y = -1
                pat.isMissingLabel = True

            while iter < max_iter:
                self.CVB0_test(batch_patient, iter)
                pred_result = self.lasso.predict_proba(self.exp_n)[:, 1]
                avg_pr = average_precision_score(self.y_test, pred_result)
                fpr, tpr, threshold = roc_curve(self.y_test,
                                                pred_result,
                                                pos_label=1)
                roc_auc_rf = auc(fpr, tpr)
                pickle.dump([self.y_test, pred_result],
                            open('prediction_y_p_test.pkl', 'wb'))
                if (iter + 1) % 50 == 0:
                    self.save_model(iter + 1)
                iter += 1
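
Note that predict_proba in the loop above assumes self.lasso has already been fitted on the training-set topic loadings, as Example #7 below does with self.lasso.fit(self.exp_n, self.y_train). A minimal standalone sketch of that fit-then-score pattern (toy arrays, illustration only):

import numpy as np
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
exp_n_train = np.random.rand(20, 5)                      # toy [D_train, K] topic loadings
y_train = np.random.randint(0, 2, 20)                    # toy binary labels
clf.fit(exp_n_train, y_train)
scores = clf.predict_proba(np.random.rand(4, 5))[:, 1]   # P(y=1) for new documents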
Example #4
    def predict(self, corpus, max_iter=500):
        self.D = corpus.D
        self.C = corpus.C
        self.generator = Corpus.generator_full_batch(corpus)
        self.lambda_ = np.zeros(self.D)
        # self.exp_g = np.random.normal(size=self.D)
        self.exp_z_avg = np.zeros((self.D, self.K))
        self.exp_q_z = 0
        self.exp_n = np.random.rand(self.D, self.K)
        # self.exp_m = np.random.rand(self.T, self.K)
        # self.exp_p = np.random.rand(self.T, self.W, self.K)
        for d in range(self.D):
            self.exp_n[d] /= np.sum(self.exp_n[d])
        # for t in range(self.T):
        #     self.exp_m[t] /= np.sum(self.exp_m[t])
        # for t in range(self.T):
        #     for w in range(self.W):
        #         self.exp_p[t, w] /= np.sum(self.exp_p[t, w])
        self.exp_n_sum = np.sum(self.exp_n, axis=1) # sum over k, exp_n is [D K] dimensionality
        # self.exp_m_sum = np.sum(self.exp_m, axis=0) # sum over t, exp_m is [T K] dimensionality
        # self.exp_p_sum = np.sum(self.exp_p, axis=1) # sum over w, exp_p is [T W K] dimensionality

        # elbo = [100, 0]
        iter = 1
        for i, d in enumerate(self.generator):
            batch_patient, batch_i, M = d
            self.gamma = {pat.patient_id: np.random.rand(len(pat.words_dict), self.K) for pat in batch_patient}

            for pat in batch_patient:
                pat.y = -1
                pat.isMissingLabel = True
            while iter <= max_iter:
                self.CVB0_test(batch_patient, iter)
                # elbo.append(self.ELBO())
                # print("%s elbo %s diff %s" % (iter, elbo[-1], np.abs(elbo[-1] - elbo[-2])))
                # if (iter + 1) % 50 == 0:
                #     self.save_model(iter + 1)
                # probit-style predictive probability per document
                num = self.exp_z_avg.dot(self.m_)
                den = np.array([1 + np.sqrt(np.dot(z_avg.dot(np.diag(self.s)), z_avg)) for z_avg in self.exp_z_avg])
                p = norm.cdf(num / den)
                y = np.random.binomial(1, p).flatten()
                avg_pr = average_precision_score(self.y_test, p)
                fpr, tpr, threshold = roc_curve(self.y_test, p, pos_label=1)
                roc_auc_rf = auc(fpr, tpr)
                print("it-%d: AUC %.2f - APRC %.2f" % (iter, roc_auc_rf, avg_pr))

                iter += 1
            # save prediction
            pickle.dump((self.y_test, p), open(os.path.join(self.out, 'prediction_y_p_%d.pkl' % self.K), 'wb'))
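
The per-document loop above computes a probit-style predictive probability, p = Phi(m_ . z_avg / (1 + sqrt(z_avg' diag(s) z_avg))). A vectorized sketch of the same computation (assumes exp_z_avg is [D, K] and m_, s are length-K vectors, as initialized in Example #1):

import numpy as np
from scipy.stats import norm

def predictive_probs(exp_z_avg, m_, s):
    # numerator: projection of each document's average topic assignment onto m_
    num = exp_z_avg @ m_                                           # [D]
    # denominator: 1 + sqrt(z_avg' diag(s) z_avg) per document
    den = 1.0 + np.sqrt(np.einsum('dk,k,dk->d', exp_z_avg, s, exp_z_avg))
    return norm.cdf(num / den)                                     # [D]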
Example #5
    def infer(self, corpus: Corpus, infer_only=False, predict=False, max_iter=500, tol=1e-4):
        elbo = [100, 0]
        iter = 0
        diff = 1

        # init containers
        self.C = corpus.C
        self.D = corpus.D
        self.init_variational_params()
        self.init_expectations(infer_only)

        # sample a full batch of corpus
        generator = Corpus.generator_full_batch(corpus)

        # init gamma uniformly
        for i, d in enumerate(generator):
            batch_patient, batch_i, M = d
            self.gamma = {pat.patient_id: np.random.rand(len(pat.words_dict), self.K) for pat in batch_patient}

        while iter < max_iter and diff > tol:
            for i, d in enumerate(generator):
                batch_patient, batch_index, M = d
                old_gamma = self.gamma.copy()  # shallow copy: assumes CVB0 rebinds each patient's gamma array rather than mutating it in place

                # infer topics
                self.CVB0(batch_patient, infer_only)

                # test convergence
                # elbo.append(self.ELBO())
                iter += 1
                diff = np.mean([np.mean(np.abs(old_gamma[pat.patient_id] - self.gamma[pat.patient_id])) for pat in batch_patient])  # gamma is keyed by patient_id
                print("it %d. diff: %.5f " % (iter, diff))

                # predict
                if predict:
                    self.predict(corpus.labels)

                if (iter + 1) % 100 == 0:
                    self.save_model(iter + 1)
                if not (iter < max_iter and diff > tol):
                    break  # stop once converged or the iteration budget is exhausted
        pickle.dump(elbo, open(os.path.join(self.out, 'elbo_training.pkl'), 'wb'))
        pickle.dump(self.gamma, open(os.path.join(self.out, 'gamma_train.pkl'), 'wb'))

        return self.gamma
Example #6
        with h5py.File(model_path, 'r') as hf:
            for param in self.parameters:
                if param == 'gamma':
                    pass
                    # self.gamma = dd.io.load(gamma_file)
                else:
                    self.__setattr__(param, hf[param][...])


if __name__ == '__main__':
    train_dir = "/Users/cuent/Downloads/processed_new/mv/out/cv1/train"
    test_dir = "/Users/cuent/Downloads/processed_new/mv/out/cv1/test"
    # train_dir = "/Users/cuent/Downloads/processed_new/single"
    # test_dir = "/Users/cuent/Downloads/processed_new/single"

    c_train = Corpus.read_corpus_from_directory(train_dir)
    c_test = Corpus.read_corpus_from_directory(test_dir)

    y_train_true = np.array([p[0].y for p in c_train])
    y_test_true = np.array([p[0].y for p in c_test])
    K = 50

    mixehr = MixEHR(K, c_train)

    mixehr.y_train = y_train_true
    mixehr.y_test = y_test_true

    mixehr.inference_svb(max_iter=500, save_every=100)
    mixehr.load_model("model_smixehr_k50_it500.hdf5")
    mixehr.predict(c_test, max_iter=300)
Example #7
                if param == 'gamma':
                    self.gamma = pickle.load(
                        open(os.path.join(self.out, 'gamma%d.pkl' % iter),
                             'rb'))
                    self.exp_n = pickle.load(
                        open(os.path.join(self.out, 'exp_n_%d.pkl' % iter),
                             'rb'))
                    self.exp_m = pickle.load(
                        open(os.path.join(self.out, 'exp_m_%d.pkl' % iter),
                             'rb'))
                    self.exp_p = pickle.load(
                        open(os.path.join(self.out, 'exp_p_%d.pkl' % iter),
                             'rb'))
                else:
                    self.__setattr__(param, hf[param][...])
        self.lasso.fit(self.exp_n, self.y_train)


if __name__ == '__main__':
    # c_train = Corpus.read_corpus_from_directory("../split/train")
    # c_test = Corpus.read_corpus_from_directory("../split/test")
    c_train = Corpus.read_corpus_from_directory("../dataset/cv1/train")
    c_test = Corpus.read_corpus_from_directory("../dataset/cv1/test")
    y_train_true = np.array([p[0].y for p in c_train])
    y_test_true = np.array([p[0].y for p in c_test])
    K = 100
    mixehr = MixEHR(K, c_train)
    mixehr.y_train = y_train_true
    mixehr.y_test = y_test_true
    # exp_g, gamma = code.inference_svb()
    mixehr.load_model("model_mixehr_100_100.hdf5")
    mixehr.predict(c_test)
Example #8
corpus_params = {
    'word_ngrams': (1, 2),
    'word_topk': 100,
    'pos_ngrams': (1, 2),
    'word_lemma': True,
    'word_entities': False,
    'word_punct': False,
    'pos_detailed': False,
    'char_punct': False,
    'char_lower': False,
    'coref_n': 2,
    'coref_pos_types': ['DT', 'NN', 'NNP', 'NNPS', 'NNS', 'PRP', 'PRP$'],
    'coref_dependencies': ['dobj', 'nsubj', 'nsubjpass', 'pobj', 'poss'],
    'coref_group': True
}

corpus = Corpus()
for i in range(N):
    print('Loading document {} of {}'.format(i + 1, N))
    doc = Document(text=sample_data.body.iloc[i],
                   author=sample_data.author.iloc[i],
                   category=sample_data.primary_tags.iloc[i],
                   spacy_model=nlp)
    corpus.documents.append(doc)


corpus.init_docs(**corpus_params)
corpus.build_data()


corpus.save('data/full_corpus_100.pkl')
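
Assuming corpus.save above writes a standard pickle (the snippet does not show its implementation), the serialized corpus could later be reloaded with something like:

import pickle

# assumption: Corpus.save pickles the whole object to the given path
with open('data/full_corpus_100.pkl', 'rb') as f:
    corpus = pickle.load(f)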
Example #9
def extract_docs(corpus_dir, vocab, out_dir):
    corpus, meta = Corpus.read_corpus_from_directory(corpus_dir, True)

    type_ids, vocab_ids = meta
    type_ids_rev = {}
    vocab_ids_rev = {}

    for k in type_ids:
        type_ids_rev[type_ids[k]] = k

    for k in vocab_ids:
        vocab_ids_rev[vocab_ids[k]] = k

    ignored_words = 0
    docs_only = []
    docs_freq = []
    patients = []
    ids = []
    responses = []

    pbar = tqdm(corpus)
    for c, _ in pbar:
        patient_id = c.index
        i = c.patient_id
        label = c.y
        words = c.words_dict
        flat_words = []
        flat_words_freq = []
        for (type_id, word_id), freq in words.items():
            type_id = type_ids_rev[type_id]
            word_id = vocab_ids_rev[word_id]

            flat_words_freq.append("%d:%d" % (word_id, freq))

            vocab_type = vocab[type_id][['pheId', 'pheName']]
            w = vocab_type.loc[vocab_type['pheId'] == word_id]['pheName'].tolist()
            if len(w) > 1:
                print(w)
            w = freq * w
            if len(w) > 0:
                flat_words.extend(w)
            else:
                print(type_id, word_id)
                ignored_words += 1
        docs_only.append(' '.join(flat_words))
        docs_freq.append("%d %s" % (len(flat_words_freq), ' '.join(flat_words_freq)))
        ids.append(i)
        patients.append(patient_id)
        responses.append(label)
    if ignored_words > 0:
        print("Couldn't find %d words." % ignored_words)

    data = {'mixehr_id': ids, 'patient_id': patients, 'label': responses, 'text': docs_only}

    # save data
    mixehr_data = pd.DataFrame(data)
    mixehr_data.to_csv(os.path.join(out_dir, 'mix_raw.csv'), index=False)
    # save labels only
    mixehr_data[['label']].to_csv(os.path.join(out_dir, 'mix_label.csv'), index=False, header=False)
    # save slda format
    pd.DataFrame({'text': docs_freq}).to_csv(os.path.join(out_dir, 'mix_word_freq.csv'), index=False, header=False)
    # save vocabulary
    pickle.dump(vocab, open(os.path.join(out_dir, 'vocab.pkl'), 'wb'))
    # save id lookup
    pickle.dump((type_ids, vocab_ids), open(os.path.join(out_dir, 'id_mixehr_seq.pkl'), 'wb'))
    pickle.dump((type_ids_rev, vocab_ids_rev), open(os.path.join(out_dir, 'id_seq_mixehr.pkl'), 'wb'))
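
Each line of mix_word_freq.csv written above has the form "<n_pairs> word_id:freq word_id:freq ...". A small sketch for reading such a line back into a {word_id: freq} dict:

def parse_freq_line(line):
    parts = line.split()
    n_pairs = int(parts[0])                        # leading count written by extract_docs
    pairs = [p.split(':') for p in parts[1:]]
    assert len(pairs) == n_pairs
    return {int(word_id): int(freq) for word_id, freq in pairs}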
Example #10
            for param in self.parameters:
                self.__setattr__(param, hf[param][...])




def train_cv():
    kf = StratifiedKFold(5, shuffle=True, random_state=42)
    folder = '/Users/cuent/Downloads/processed_new/delete1'
    corpus = Corpus.read_corpus_from_directory(folder + "/train")
    for train_index, test_index in kf.split(corpus, corpus.labels):
        for i in train_index:
            corpus.dataset[i].train = True
        for i in test_index:
            corpus.dataset[i].train = False

if __name__ == '__main__':
    folder = '/Users/cuent/Downloads/processed_new/delete1'
    c_train = Corpus.read_corpus_from_directory(folder + "/train")
    # c_test = Corpus.read_corpus_from_directory(folder + "/test")
    # c_train = Corpus.read_corpus_from_directory("../split/train")
    # c_test = Corpus.read_corpus_from_directory("../split/test")

    K = 21
    mixehr = MixEHR(K, c_train.T, c_train.W)
    gamma = mixehr.infer(c_train, predict=True)
    # code.predict(c_test)
    # train_cv()