def test_tasks():
    """Smoke test: assemble epigenome, label and sequence task inputs for HEK293."""
    set_default(assembly_path=f'{os.getcwd()}/genomes')
    cell_line = 'HEK293'
    epigenomes, labels = {}, {}
    for region in ('enhancers', 'promoters'):
        region_data, region_labels = data_retrieval(cell_line, region)
        # Impute missing values from the 5 nearest neighbours before use.
        epigenomes[region] = fit_neighbours(region_data, 5)
        labels[region] = region_labels
    # BED annotations are derived from the (already imputed) epigenomic frames.
    sequences = {region: to_bed(frame) for region, frame in epigenomes.items()}
    _, _, _ = get_tasks(epigenomes, labels, sequences)
def test_checks():
    """Run dataset sanity checks (overfitting risk, NaNs, class balance) for HEK293.

    The downloaded 'datasets' directory is removed in a ``finally`` block so
    repeated runs start clean even when one of the checks raises.
    """
    cell_line = 'HEK293'
    try:
        en, lab_en = data_retrieval(cell_line, 'enhancers')
        pr, lab_pr = data_retrieval(cell_line, 'promoters')
        epigenomes = {'enhancers': en, 'promoters': pr}
        labels = {'enhancers': lab_en, 'promoters': lab_pr}
        overfitting_risk(epigenomes)
        nan_check(epigenomes)
        check_class_balance(labels)
    finally:
        # Best-effort cleanup: the directory may not exist if retrieval failed early.
        try:
            rmtree('datasets')
        except FileNotFoundError:
            pass
def test_data_retrieval():
    """Smoke test: retrieve HepG2 promoter data, drop correlated features, plot."""
    cell_line, region = 'HepG2', 'promoters'
    epigenomes, labels = data_retrieval(cell_line=cell_line, region=region)
    # Impute missing values from the 5 nearest neighbours.
    epigenomes = fit_neighbours(epigenomes, 5)
    scores = drop_too_correlated(epigenomes)
    show({region: epigenomes}, {region: labels}, {region: scores})
def test_data_prediction():
    """End-to-end epigenomic prediction smoke test on HEK293 enhancers.

    Uses tiny settings (2 epochs, 2 splits, 2 Boruta iterations) to keep the
    run short while exercising the whole preprocessing + training pipeline.
    """
    set_default(cell_line='HEK293', region='enhancers', epochs=2, splits=2,
                batch_size=1024, boruta_iterations=2,
                results_path=f'{os.getcwd()}/results')
    cell_line = get_default('cell_line')
    region = get_default('region')
    features, targets = data_retrieval(cell_line, region)
    # Preprocessing: impute, normalise, then run feature selection.
    features = fit_neighbours(features, 5)
    features = apply_z_scoring(features)
    features = drop_constant_features(region, features)
    features = drop_uncorrelated(features, targets)
    features = get_filtered_with_boruta(features, targets, cell_line, region)
    input_shape = (features.shape[1],)
    models = [
        get_mlp_epigenomics()(input_shape, validation_split=0.1, name="MLP"),
        get_ffnn_epigenomics_v1()(input_shape, validation_split=0.1, name="FFNN"),
    ]
    results = predict_epigenomics(features.values, targets.values.ravel(), models)
    show_barplots(results, 'epi')
def test_data_retrieval():
    """Smoke test: decompose and plot HepG2 promoter epigenomic data."""
    epigenomes, labels = data_retrieval(cell_line='HepG2', region='promoters')
    # Impute missing values from the 5 nearest neighbours before decomposition.
    epigenomes = fit_neighbours(epigenomes, 5)
    feature_matrices = [epigenomes.values]
    target_vectors = [labels.values.ravel()]
    show_decomposed_data(feature_matrices, target_vectors, ['Epigenomes promoters'])
def test_data_prediction():
    """End-to-end sequence prediction smoke test on HepG2 promoters.

    Uses tiny settings (2 epochs, 2 splits) to keep the run short while
    exercising BED annotation and sequence-model training.
    """
    cwd = os.getcwd()
    set_default(cell_line='HepG2', region='promoters', epochs=2, splits=2,
                batch_size=1024, results_path=f'{cwd}/results',
                assembly_path=f'{cwd}/genomes')
    bed_input, targets = data_retrieval(get_default('cell_line'),
                                        get_default('region'))
    bed_input = to_bed(bed_input)
    # One-hot sequence windows: (window length, number of nucleotides).
    input_shape = (get_default('window_size'), len(get_default('nucleotides')))
    models = [get_mlp_sequential()(input_shape, name="MLP")]
    results = predict_sequences(bed_input, targets.values.ravel(), models)
    show_barplots(results, 'seq')
def test_data_retrieval():
    """Smoke test: HepG2 promoter retrieval completes without raising."""
    data_retrieval(cell_line='HepG2', region='promoters')
from bioinformatica.data_manipulation import fit_neighbours, apply_z_scoring, drop_constant_features, drop_uncorrelated
from bioinformatica.data_prediction import predict_epigenomics, predict_sequences, show_barplots
from bioinformatica.data_retrieval import data_retrieval, to_bed
from bioinformatica.defaults import set_default, get_default
from bioinformatica.models import get_mlp_epigenomics, get_ffnn_epigenomics_v1, get_ffnn_epigenomics_v2, get_ffnn_epigenomics_v3, \
    get_mlp_sequential, get_ffnn_sequential, get_cnn_sequential_v1

# Global configuration shared by the pipeline below (module-level side effect).
# NOTE(review): dataset_path is a hard-coded, user-specific Windows path — make
# it configurable (e.g. relative to os.getcwd()) before sharing this script.
set_default(
    assembly='hg19',  # genome assembly used for sequence annotation
    cell_line='HEK293',
    region='promoters',
    dataset_path= r'C:\Users\matte\Documents\GitHub\bioinformatica\HepG2\datasets')

if __name__ == '__main__':
    # Retrieve raw epigenomic data and labels for the configured cell line/region.
    input_data_o, output_data = data_retrieval(get_default('cell_line'), get_default('region'))
    input_data_seq = to_bed(
        input_data_o
    )  # annotate genome using index extracted from epigenomic data
    # epigenomic data's preprocessing
    input_data_epi = fit_neighbours(input_data_o, 5)  # NaN imputation
    input_data_epi = apply_z_scoring(input_data_epi)  # Normalizing
    # feature selection
    input_data_epi = drop_constant_features(get_default('region'), input_data_epi)
    input_data_epi = drop_uncorrelated(input_data_epi, output_data)
    # NOTE(review): get_filtered_with_boruta is never imported in this file, so
    # this line raises NameError at runtime — confirm which bioinformatica
    # module exports it and add the import.
    input_data_epi = get_filtered_with_boruta(input_data_epi, output_data, get_default('cell_line'), get_default('region'))
def test_data_retrieval():
    """Smoke test: retrieve HepG2 enhancer data and extract its sequences."""
    features, _ = data_retrieval(cell_line='HepG2', region='enhancers')
    get_sequences(features)