Example 1
        nb_add = min(nb_samples - nb_accepted, to_add.shape[0])
        to_add = to_add[:nb_add, :]
        accepted_samples[nb_accepted:(nb_accepted + nb_add), :] = to_add
        nb_accepted += nb_add
        print('{}/{}'.format(nb_accepted, nb_samples))

    return np.array(accepted_samples)


if __name__ == '__main__':
    # Load data
    root_gene = None  # 'CRP'
    minimum_evidence = 'weak'
    max_depth = np.inf
    expr, gene_symbols, sample_names = load_data(root_gene=root_gene,
                                                 minimum_evidence=minimum_evidence,
                                                 max_depth=max_depth)
    file_name = 'EColi_n{}_r{}_e{}_d{}'.format(len(gene_symbols), root_gene, minimum_evidence, max_depth)
    print('File: {}'.format(file_name))

    # Split data into train and test sets
    train_idxs, test_idxs = split_train_test(sample_names)
    expr_train = expr[train_idxs, :]
    expr_test = expr[test_idxs, :]

    # Load GAN
    ggan = gGAN(normalize(expr_train), gene_symbols)
    ggan.load_model(file_name)

    # Generate synthetic data
    mean = np.mean(expr_train, axis=0)
Example 2
def main():
    result = pp.load_data(FILE_NAME)
    for dict_ in result:
        print("=> ", dict_)
def demo_tfidf_diagnosis(df_seq=None):
    import os
    from data_pipeline import load_data
    from utils import Diagnosis

    col_key = 'Patient_id'
    col_date = 'Diag_date'
    col_code = 'ICD10'
    col_intv = 'History'

    n_samples = 1000
    tLoad = True

    if df_seq is None: 
        df_diag, df_treat, df_res = load_data(input_dir=os.getcwd(), verbose=False)
        diag = Diagnosis(df_diag) # create a Diagnosis object
        if tLoad: 
            df_seq = diag.load(dtype='seq')  # load the pre-computed sequence data, since sequencing takes a minute or two
        else: 
            df_seq = diag.sequence(tFilterByICD=True, tFilterByLength=False)
            # note: set tFilterByICD to True to only include valid (well-formatted) ICD10 codes
            #       set tFilterByLength to False to include the entire d-sequence for each patient
            #       Say you want to focus on only the most recent 100 days of diagnoses, then pass 
            #       n_days_lookback=100
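            # e.g. (hypothetical call, reusing the flags described above):
            # df_seq = diag.sequence(tFilterByICD=True, tFilterByLength=False, n_days_lookback=100)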
    else: 
        assert not df_seq.empty

    pids = df_seq[col_key].values
    docs = df_seq.set_index(col_key)[col_code].to_dict()
    corpus = np.array([docs[pid] for pid in pids])
    Nd, Np = len(docs), len(pids)

    ngram_range = (1, 1)
    stop_words = []

    tokenizer = lambda doc: doc.split(" ")
    model = TfidfVectorizer(analyzer='word', tokenizer=tokenizer, ngram_range=ngram_range, 
                min_df=0, smooth_idf=True, lowercase=False, stop_words=stop_words) 

    Xtr = model.fit_transform(corpus)
    analyzer = model.build_analyzer()
    print("(demo_tfidf_diagnosis) ngram_range: {} => {}".format(ngram_range, analyzer("D64.9")))
    print(f"... Nd: {Nd}, dim(Xtr): {Xtr.shape}, size(vocab): {len(model.vocabulary_)}")
    assert Xtr.shape[0] == len(pids), f"number of rows in Xtr {Xtr.shape[0]} not equal to num of (unique) patients {len(pids)}"

    fset = model.get_feature_names()
    assert sum(1 for w in stop_words if w in fset) == 0, "Found stop words in the feature set!"
    print(f"... example feature names (n={len(fset)}=?={len(model.vocabulary_)}):\n{fset[:50]}\n{fset[-50:]}\n")

    n_examples = 10
    test_indices = np.random.choice(range(Nd), n_examples, replace=False)
    for i, dvec in enumerate(Xtr):
        # if i in test_indices: 
        #     print("...... doc #[{}]:\n{}\n".format(i, dvec.toarray()))
        assert np.sum(dvec) > 0

    print("... size(ICD10 codes): {}".format( len(model.vocabulary_) ))

    # --- interpretation 
    print("(demo_tfidf_diagnosis) Interpreting the TF-IDF model")
    topn = 3
    for i in range(Xtr.shape[0]):
        df_doc = top_features_in_doc(Xtr, features=fset, row_id=i, top_n=topn)
        if i in test_indices: 
            print("...... doc #{}:\n{}\n".format(i, df_doc.to_string(index=True)))

    topn = 10
    print("... top N ICD10 codes overall across all docs")
    df_topn = top_mean_features(Xtr, fset, grp_ids=None, min_tfidf=0.1, top_n=topn)
    print("... doc(avg):\n{}\n".format(df_topn.to_string(index=True)))

    # --- interface
    # a. get the scores of individual tokens or n-grams in a given document? 
    print("(demo_tfidf_diagnosis) Get the scores of individual ICD10s in a given d-sequence")
    df = pd.DataFrame(Xtr.toarray(), columns=fset)
    print(df.head())

    # df.to_csv(Diagnosis.get_path(dtype='tfidf'), sep='|', index=False, header=True)

    return df
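

# Note: top_features_in_doc and top_mean_features are not defined in this snippet. Below is a
# minimal sketch of what they are assumed to do, inferred only from how they are called above
# (per-document and corpus-wide ranking of TF-IDF weights); the real helpers may differ.
import numpy as np
import pandas as pd


def top_features_in_doc(Xtr, features, row_id, top_n=10):
    """Top-n features of a single document (one row of the sparse TF-IDF matrix)."""
    row = np.squeeze(Xtr[row_id].toarray())
    top_ids = np.argsort(row)[::-1][:top_n]
    return pd.DataFrame([(features[i], row[i]) for i in top_ids], columns=['feature', 'tfidf'])


def top_mean_features(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=10):
    """Top-n features by mean TF-IDF across all documents (or the subset grp_ids)."""
    D = Xtr.toarray() if grp_ids is None else Xtr[grp_ids].toarray()
    D[D < min_tfidf] = 0  # ignore weights below the threshold
    means = D.mean(axis=0)
    top_ids = np.argsort(means)[::-1][:top_n]
    return pd.DataFrame([(features[i], means[i]) for i in top_ids], columns=['feature', 'tfidf'])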
Example 4
    Plots the TF activity histogram. Activities are computed with Wilcoxon's non-parametric rank-sum test, which
    checks whether the targets of each TF exhibit significant rank differences compared with the non-target genes.
    The resulting p-values are corrected for multiple testing with the Benjamini-Hochberg procedure.
    :param expr: matrix of gene expressions. Shape=(nb_samples, nb_genes)
    :param gene_symbols: list of gene_symbols. Shape=(nb_genes,)
    :param tf_tg: dict with TF symbol as key and list of TGs' symbols as value
    :param xlabel: label on the x axis
    :param color: histogram color
    :return: matplotlib axes
    """

    # Plot histogram
    values, _ = find_chip_rates(expr, gene_symbols, tf_tg)
    bins = np.logspace(-10, 1, 20, base=2)
    bins[0] = 0
    ax = plt.gca()
    plt.hist(values, bins=bins, color=color)
    ax.set_xscale('log', basex=2)
    ax.set_xlim(2**-10, 1)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Density')
    return ax
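

# The docstring above describes a Wilcoxon rank-sum test with Benjamini-Hochberg correction.
# A minimal, hypothetical sketch of that computation (NOT the actual find_chip_rates
# implementation): for each TF, rank-sum-test a per-gene summary of its targets' expression
# against the non-target genes, then BH-correct the p-values.
def _wilcoxon_bh_pvalues(expr, gene_symbols, tf_tg):
    from scipy.stats import ranksums
    from statsmodels.stats.multitest import multipletests

    mean_expr = expr.mean(axis=0)  # one summary value per gene, shape=(nb_genes,)
    idx = {g: i for i, g in enumerate(gene_symbols)}
    tfs, pvals = [], []
    for tf, tgs in tf_tg.items():
        target_idx = [idx[g] for g in tgs if g in idx]
        other_idx = [i for g, i in idx.items() if g not in set(tgs)]
        if not target_idx or not other_idx:
            continue  # skip TFs whose targets are absent from the expression matrix
        _, p = ranksums(mean_expr[target_idx], mean_expr[other_idx])
        tfs.append(tf)
        pvals.append(p)
    # Benjamini-Hochberg correction for multiple testing
    _, pvals_corrected, _, _ = multipletests(pvals, method='fdr_bh')
    return dict(zip(tfs, pvals_corrected))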


if __name__ == '__main__':
    r_expr, gene_symbols, sample_names = load_data(root_gene='crp')
    r_tf_tg_corr_flat, r_tg_tg_corr_flat = compute_tf_tg_corrs(r_expr,
                                                               gene_symbols,
                                                               flat=False)
    theta_dx_dz = phi_coefficient(r_tf_tg_corr_flat, r_tg_tg_corr_flat)


def main(_):
    """Main function for Domain-Adversarial Neural Networks (DANN)"""
    tf.reset_default_graph()
    # Load MNIST and MNIST-M dataset
    (x_train, y_train), (x_test, y_test), (x_m_train, y_m_train), (x_m_test, y_m_test) = dp.load_data()

    # Configurations first
    iter_ratio = math.ceil((x_train.shape[0] / FLAGS.batch_size))
    print(iter_ratio)
    # We are working with transformed MNIST dataset => image shape is 28x28x3
    feature_columns = [tf.feature_column.numeric_column("x_s", shape=(28, 28, 3)),
                       tf.feature_column.numeric_column("x_t", shape=(28, 28, 3))]

    # Set up the session config
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    config = tf.estimator.RunConfig(
        save_checkpoints_steps=int(iter_ratio),
        log_step_count_steps=None,
        session_config=session_config
    )

    # Set up the estimator
    classifier = tf.estimator.Estimator(
        model_fn=estimator_model_fn,
        model_dir="./model",
        params={
            'feature_columns': feature_columns,
            'iter_ratio': iter_ratio
        },
        config=config
    )
    if FLAGS.mode == 'train':
        # Set up logging in training mode
        logging_hook = tf.train.LoggingTensorHook(
            tensors={"lr": "learning_rate", "loss": "loss", "source_class_acc": "source_class_acc",
                     "target_class_acc": "target_class_acc"},
            every_n_iter=int(iter_ratio))
        # Train DANN
        classifier.train(
            # input_fn=lambda: dp.train_input_fn({'x_s': x_train, 'x_t': x_m_train}, y_train, FLAGS.batch_size),
            input_fn=tf.estimator.inputs.numpy_input_fn({'x_s': x_train, 'x_t': x_m_train}, {'y_s': y_train, 'y_t': y_m_train},
                                                        shuffle=True, batch_size=128, num_epochs=FLAGS.total_epochs),
            max_steps=int(iter_ratio*FLAGS.total_epochs),
            hooks=[logging_hook]
        )
    if FLAGS.mode == 'eval':
        # Evaluate DANN
        eval_hooks = tf.train.LoggingTensorHook(
            tensors={"source_class_acc_2": "source_class_acc_2", "target_class_acc_2": "target_class_acc_2"},
            every_n_iter=1)
        eval_result = classifier.evaluate(
            input_fn=tf.estimator.inputs.numpy_input_fn(x={'x_s': x_test, 'x_t': x_m_test},
                                                        y={'y_s': y_test, 'y_t': y_m_test},
                                                        batch_size=128,
                                                        num_epochs=1,
                                                        shuffle=False),
            hooks=[eval_hooks]
        )
        print('\nSource set accuracy: {source_class_acc:0.3f}\nTarget set accuracy: {target_class_acc:0.3f}\n'.format(**eval_result))
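
    if FLAGS.mode == 'predict':
        # Hypothetical sketch (not part of the original snippet): a minimal 'predict' branch
        # built on the same tf.estimator input pipeline. It assumes estimator_model_fn returns
        # an EstimatorSpec with predictions when mode == tf.estimator.ModeKeys.PREDICT.
        predictions = classifier.predict(
            input_fn=tf.estimator.inputs.numpy_input_fn(x={'x_s': x_test, 'x_t': x_m_test},
                                                        batch_size=128,
                                                        num_epochs=1,
                                                        shuffle=False))
        for i, pred in enumerate(predictions):
            if i >= 5:  # inspect only the first few predictions
                break
            print(pred)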

    assert FLAGS.mode in ('train', 'eval', 'predict'), '-mode flag has to be one of "eval", "train", "predict".'