コード例 #1
0
def test_tasks():
    """Smoke-test ``get_tasks`` on HEK293 enhancer and promoter data.

    Nothing is asserted about the content of the result: the test passes
    when every step runs and ``get_tasks`` yields exactly three values.
    """
    set_default(assembly_path=f'{os.getcwd()}/genomes')
    cell_line = 'HEK293'
    regions = ('enhancers', 'promoters')

    # Retrieve both regions first, then post-process them, so the helper
    # calls happen in the same order for both regions.
    raw = {}
    labels = {}
    for region in regions:
        raw[region], labels[region] = data_retrieval(cell_line, region)

    epigenomes = {region: fit_neighbours(raw[region], 5) for region in regions}
    sequences = {region: to_bed(epigenomes[region]) for region in regions}

    # The three-way unpacking doubles as a check on the return arity.
    _, _, _ = get_tasks(epigenomes, labels, sequences)
コード例 #2
0
def test_checks():
    """Run the dataset sanity checks on HEK293 enhancers and promoters.

    Calls ``overfitting_risk``, ``nan_check`` and ``check_class_balance`` on
    the retrieved data. The ``datasets`` directory is removed afterwards even
    when retrieval or one of the checks raises, so a failing run does not
    leave files behind for subsequent tests.
    """
    cell_line = 'HEK293'
    try:
        epigenomes = {}
        labels = {}
        for region in ('enhancers', 'promoters'):
            epigenomes[region], labels[region] = data_retrieval(cell_line, region)

        overfitting_risk(epigenomes)
        nan_check(epigenomes)
        check_class_balance(labels)
    finally:
        # Cleanup must not depend on the checks succeeding. If this raises
        # while another exception is propagating, Python chains the two, so
        # the original failure stays visible in the traceback.
        rmtree('datasets')
コード例 #3
0
def test_data_retrieval():
    """Smoke-test retrieval, neighbour fitting and plotting for HepG2 promoters.

    The value returned by ``drop_too_correlated`` is forwarded to ``show`` as
    the scores argument; no assertions are made on any intermediate result.
    """
    region = 'promoters'
    raw_epigenomes, region_labels = data_retrieval(cell_line='HepG2', region=region)
    fitted = fit_neighbours(raw_epigenomes, 5)
    correlation_scores = drop_too_correlated(fitted)

    # ``show`` takes one dict per argument, each keyed by region name.
    show(
        {'promoters': fitted},
        {'promoters': region_labels},
        {'promoters': correlation_scores},
    )
コード例 #4
0
def test_data_prediction():
    """Smoke-test the epigenomic prediction pipeline on HEK293 enhancers.

    Defaults are set to a small budget (2 epochs, 2 splits, 2 Boruta
    iterations). The retrieved data goes through the preprocessing and
    feature-selection helpers, two models are built and handed to
    ``predict_epigenomics``, and the results are plotted.
    """
    settings = {
        'cell_line': 'HEK293',
        'region': 'enhancers',
        'epochs': 2,
        'splits': 2,
        'batch_size': 1024,
        'boruta_iterations': 2,
        'results_path': f'{os.getcwd()}/results',
    }
    set_default(**settings)

    epigenomes, labels = data_retrieval(
        get_default('cell_line'), get_default('region'))

    # Preprocessing and feature-selection steps, applied in sequence.
    features = fit_neighbours(epigenomes, 5)
    features = apply_z_scoring(features)
    features = drop_constant_features(get_default('region'), features)
    features = drop_uncorrelated(features, labels)
    features = get_filtered_with_boruta(
        features, labels, get_default('cell_line'), get_default('region'))

    # One input per feature column that is left after selection.
    input_shape = (features.shape[1], )
    builders = (
        (get_mlp_epigenomics, "MLP"),
        (get_ffnn_epigenomics_v1, "FFNN"),
    )
    epi_models = [
        factory()(input_shape, validation_split=0.1, name=model_name)
        for factory, model_name in builders
    ]

    results = predict_epigenomics(
        features.values, labels.values.ravel(), epi_models)
    show_barplots(results, 'epi')
コード例 #5
0
def test_data_retrieval():
    """Smoke-test ``show_decomposed_data`` on HepG2 promoter epigenomes.

    A single dataset is passed, so each of the three list arguments has
    exactly one element.
    """
    raw_epigenomes, region_labels = data_retrieval(cell_line='HepG2',
                                                   region='promoters')
    fitted = fit_neighbours(raw_epigenomes, 5)

    show_decomposed_data(
        [fitted.values],
        [region_labels.values.ravel()],
        ['Epigenomes promoters'],
    )
コード例 #6
0
def test_data_prediction():
    """Smoke-test the sequence prediction pipeline on HepG2 promoters.

    Defaults are set to a small budget (2 epochs, 2 splits). The retrieved
    data is converted with ``to_bed``, a single model is built and handed to
    ``predict_sequences``, and the results are plotted.
    """
    settings = {
        'cell_line': 'HepG2',
        'region': 'promoters',
        'epochs': 2,
        'splits': 2,
        'batch_size': 1024,
        'results_path': f'{os.getcwd()}/results',
        'assembly_path': f'{os.getcwd()}/genomes',
    }
    set_default(**settings)

    epigenomes, labels = data_retrieval(
        get_default('cell_line'), get_default('region'))
    bed = to_bed(epigenomes)

    # Model input shape: (window size, number of nucleotide symbols).
    input_shape = (get_default('window_size'), len(get_default('nucleotides')))
    mlp = get_mlp_sequential()(input_shape, name="MLP")

    results = predict_sequences(bed, labels.values.ravel(), [mlp])
    show_barplots(results, 'seq')
コード例 #7
0
def test_data_retrieval():
    """Smoke-test ``data_retrieval`` for HepG2 promoters.

    The return value is discarded; the test passes if the call does not raise.
    """
    request = {'cell_line': 'HepG2', 'region': 'promoters'}
    data_retrieval(**request)
コード例 #8
0
from bioinformatica.data_manipulation import fit_neighbours, apply_z_scoring, drop_constant_features, drop_uncorrelated
from bioinformatica.data_prediction import predict_epigenomics, predict_sequences, show_barplots
from bioinformatica.data_retrieval import data_retrieval, to_bed
from bioinformatica.defaults import set_default, get_default
from bioinformatica.models import get_mlp_epigenomics, get_ffnn_epigenomics_v1, get_ffnn_epigenomics_v2, get_ffnn_epigenomics_v3, \
    get_mlp_sequential, get_ffnn_sequential, get_cnn_sequential_v1

# Script-wide defaults, applied at import time (outside the __main__ guard).
# NOTE(review): cell_line is 'HEK293' while dataset_path points into a
# 'HepG2' folder on one developer's machine - confirm this is intentional.
set_default(
    assembly='hg19',  # presumably the genome assembly name - TODO confirm
    cell_line='HEK293',
    region='promoters',
    dataset_path=r'C:\Users\matte\Documents\GitHub\bioinformatica\HepG2\datasets',
)

if __name__ == '__main__':
    # Retrieve the epigenomic data and labels for the configured defaults.
    input_data_o, output_data = data_retrieval(get_default('cell_line'),
                                               get_default('region'))

    input_data_seq = to_bed(
        input_data_o
    )  # annotate the genome using the index extracted from the epigenomic data

    # Preprocessing of the epigenomic data
    input_data_epi = fit_neighbours(input_data_o, 5)  # NaN imputation
    input_data_epi = apply_z_scoring(input_data_epi)  # normalization
    # Feature selection
    input_data_epi = drop_constant_features(get_default('region'),
                                            input_data_epi)
    input_data_epi = drop_uncorrelated(input_data_epi, output_data)
    # NOTE(review): get_filtered_with_boruta is not among the imports visible
    # in this file - confirm it is imported, otherwise this raises NameError.
    input_data_epi = get_filtered_with_boruta(input_data_epi, output_data,
                                              get_default('cell_line'),
                                              get_default('region'))
コード例 #9
0
def test_data_retrieval():
    """Smoke-test ``get_sequences`` on HepG2 enhancer data.

    Only the first element returned by ``data_retrieval`` is used; the second
    is unpacked and ignored.
    """
    epigenomes, _ = data_retrieval(cell_line='HepG2', region='enhancers')
    get_sequences(epigenomes)