def test_simulated_download_failure():
    for _ in Genome("sacCer3", chromosomes=sacCer3_chromosomes).items():
        pass
    sacCer3 = Genome("sacCer3", chromosomes=sacCer3_chromosomes)
    path = sacCer3._chromosome_path("chrI")
    # Corrupt the cached chromosome file to simulate a failed download.
    with open(path, "w") as f:
        f.write("Totally not JSON")
    with pytest.raises(Exception):
        sacCer3 = Genome("sacCer3", chromosomes=sacCer3_chromosomes)
    sacCer3.delete()
def test_create_new_genome_object():
    sacCer3 = Genome(
        "sacCer3",
        chromosomes=sacCer3_chromosomes,
    )
    # Remove the cached chromosome files: re-instantiating the genome
    # should then raise a RuntimeWarning.
    for path in glob("{path}/*.json".format(path=sacCer3.path)):
        os.remove(path)
    with pytest.warns(RuntimeWarning):
        sacCer3 = Genome("sacCer3", chromosomes=sacCer3_chromosomes)
    sacCer3 = Genome("sacCer3", chromosomes=sacCer3_chromosomes)
    sacCer3.gaps()
    sacCer3.filled()
    str(sacCer3)
    sacCer3.delete()
def test_gaps():
    hg19 = Genome("hg19", chromosomes=["chr1"])
    assert "chr1" in hg19
    assert "chr2" not in hg19
    filled = hg19.filled(chromosomes=["chr1"])
    hg19.bed_to_sequence(filled)
    hg19.delete()
def test_tessellate():
    hg19 = Genome("hg19", chromosomes=["chrM"])
    filled = hg19.filled(chromosomes=["chrM"])
    tessellate_bed(filled, window_size=200, alignment="left")
    tessellate_bed(filled, window_size=200, alignment="right")
    tessellate_bed(filled, window_size=200, alignment="center")
    hg19.delete()
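# Minimal usage sketch (not one of the original tests; assumes the same
# tessellate_bed import used above): tessellation splits each filled region
# into consecutive fixed-size windows, so every returned window should span
# exactly window_size bases.
def tessellate_window_size_sketch():
    hg19 = Genome("hg19", chromosomes=["chrM"])
    windows = tessellate_bed(
        hg19.filled(chromosomes=["chrM"]),
        window_size=200,
        alignment="left"
    )
    assert (windows.chromEnd - windows.chromStart == 200).all()
    hg19.delete()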
def test_multivariate_gap_center_sequence():
    hg19 = Genome("hg19", chromosomes=["chr1", "chr2", "chr3"])
    _, mean, covariance = get_gaps_statistics(hg19, 100, 200)
    gap_sequence = MultivariateGapCenterSequence(
        assembly=hg19,
        bed=get_test_bed(),
        gaps_mean=mean,
        gaps_covariance=covariance,
        batch_size=32
    )
    x1, y1 = gap_sequence[0]
    x2, y2 = gap_sequence[0]
    # Unknown nucleotides are encoded as a uniform 0.25 across the four
    # channels, so the batch should contain only the values 0.25, 0.0 and 1.0.
    assert (x1 == 0.25).any()
    assert set((0.25, 0.0, 1.0)) == set(np.unique(x1))
    # Requesting the same batch twice must yield identical data.
    assert (x1 == x2).all()
    assert (y1 == y2).all()
    assert x1.shape == (gap_sequence.batch_size, 200, 4)
    cnn_model().fit_generator(
        gap_sequence,
        steps_per_epoch=gap_sequence.steps_per_epoch,
        epochs=2,
        verbose=0,
        shuffle=True
    )
def get_sequence(epigenomes):
    window_size = 200
    genome = Genome('hg19')
    sequences = {
        region: to_dataframe(
            flat_one_hot_encode(genome, data, window_size),
            window_size
        )
        for region, data in epigenomes.items()
    }
    return sequences
def test_expand_bed_regions():
    hg19 = Genome("hg19", chromosomes=["chr2", "chr3"])
    gaps = hg19.gaps(chromosomes=["chr2", "chr3"])
    gaps = gaps[gaps.chromEnd - gaps.chromStart < 500]
    result = expand_bed_regions(gaps, 200, "left")
    assert (result.chromEnd - result.chromStart == 200).all()
    result = expand_bed_regions(gaps, 201, "right")
    assert (result.chromEnd - result.chromStart == 201).all()
    result = expand_bed_regions(gaps, 200, "center")
    assert (result.chromEnd - result.chromStart == 200).all()
    result = expand_bed_regions(gaps, 201, "center")
    assert (result.chromEnd - result.chromStart == 201).all()
    result = expand_bed_regions(gaps, 173, "center")
    assert (result.chromEnd - result.chromStart == 173).all()
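# Hedged sketch (not an original test; assumes expand_bed_regions is imported
# as above and accepts any bed-like DataFrame): resizing a toy one-row bed
# frame shows the regions coming back with exactly the requested width.
import pandas as pd

toy_bed = pd.DataFrame({
    "chrom": ["chr1"],
    "chromStart": [1000],
    "chromEnd": [1100],
})
resized = expand_bed_regions(toy_bed, 200, "center")
assert (resized.chromEnd - resized.chromStart == 200).all()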
def get_data(
    parameters: Tuple[Tuple[str, int, str], str]
) -> Union[Tuple[pd.DataFrame, np.ndarray], List[np.ndarray]]:
    load_parameters, data_type = parameters
    if data_type == 'epigenomic':
        dataset, labels = load_dataset(load_parameters)
        dataset.reset_index(drop=True, inplace=True)
        return dataset, labels
    if data_type == 'sequences':
        epigenomes, labels = load_dataset(load_parameters)
        genome = Genome('hg19')
        bed = epigenomes.reset_index()[epigenomes.index.names]
        # Use a single batch covering the whole dataset, so that one
        # indexing operation returns all the encoded sequences.
        batch_size = len(labels)
        return [
            data for data in MixedSequence(
                x=BedSequence(
                    genome,
                    bed.iloc[np.arange(batch_size)],
                    batch_size=batch_size
                ),
                y=labels[np.arange(batch_size)],
                batch_size=batch_size
            )
        ][0]
def test_wiggle():
    hg19 = Genome("hg19", chromosomes=["chr17"])
    filled = hg19.filled(chromosomes=["chr17"])
    wiggles = wiggle_bed_regions(
        filled,
        max_wiggle_size=100,
        wiggles=10,
        seed=42
    )
    path = "{pwd}/expected_wiggles.csv".format(
        pwd=os.path.dirname(os.path.abspath(__file__))
    )
    # Create the golden file on the first run, then compare against it.
    if not os.path.exists(path):
        wiggles.to_csv(path, index=False)
    pd.testing.assert_frame_equal(
        wiggles,
        pd.read_csv(path),
        check_dtype=False
    )
    hg19.delete()
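# Hedged sketch (not an original test): with a fixed seed the wiggling is
# deterministic, which is what makes the golden-file comparison above stable
# across runs.
import pandas as pd

toy_bed = pd.DataFrame({
    "chrom": ["chr1"],
    "chromStart": [1000],
    "chromEnd": [1200],
})
first = wiggle_bed_regions(toy_bed, max_wiggle_size=100, wiggles=10, seed=42)
second = wiggle_bed_regions(toy_bed, max_wiggle_size=100, wiggles=10, seed=42)
pd.testing.assert_frame_equal(first, second)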
def test_gaps():
    hg19 = Genome("hg19", chromosomes=["chr1"])
    assert "chr1" in hg19
    assert "chr2" not in hg19
    # Check that no gap has zero length.
    gaps = hg19.gaps(["chr1"])
    assert (gaps.chromEnd - gaps.chromStart != 0).all()
    # Converting gaps to sequences: they should all be Ns.
    gaps_tessellated = tessellate_bed(gaps, 200, verbose=False)
    gaps_sequences = hg19.bed_to_sequence(gaps_tessellated)
    for gap in gaps_sequences:
        assert set(gap.lower()) == set(["n"])
    filled = hg19.filled(["chr1"])
    assert (filled.chromEnd - filled.chromStart != 0).all()
    filled_tessellated = tessellate_bed(filled, 200, verbose=False)
    filled_sequences = hg19.bed_to_sequence(filled_tessellated)
    for fl in filled_sequences:
        assert "n" not in fl.lower()
    # Adding a strand column must not change the extracted sequences.
    filled_tessellated["strand"] = "."
    filled_sequences = hg19.bed_to_sequence(filled_tessellated)
    for fl in filled_sequences:
        assert "n" not in fl.lower()
    hg19.delete()
def test_genomic_sequence_determinism():
    batch_size = 32
    epochs = 5
    enhancers = pd.read_csv("tests/enhancers.csv")
    promoters = pd.read_csv("tests/promoters.csv")
    genome = Genome("hg19", chromosomes=["chr1"])
    for region in tqdm((enhancers, promoters), desc="Region types"):
        y = np.arange(0, len(region), dtype=np.int64)
        mixed_sequence = MixedSequence(
            x=BedSequence(genome, region, batch_size),
            y=VectorSequence(y, batch_size)
        )
        # Build an unshuffled reference holding the whole dataset in a single
        # batch, so it can be indexed by the labels of any shuffled batch.
        reference_mixed_sequence = MixedSequence(
            x=BedSequence(genome, region, batch_size=len(region), shuffle=False),
            y=VectorSequence(y, batch_size=len(region), shuffle=False)
        )
        X, _ = reference_mixed_sequence[0]
        for _ in trange(epochs, desc="Epochs", leave=False):
            for step in range(mixed_sequence.steps_per_epoch):
                xi, yi = mixed_sequence[step]
                # Each shuffled batch must match the reference rows selected
                # by its own labels.
                assert (X[yi.astype(int)] == xi).all()
            mixed_sequence.on_epoch_end()
def get_genome() -> Genome:
    """Download the genome, or retrieve it from the cache if already loaded."""
    genome = _cache.get('genome') or Genome(
        'hg19', cache_directory=get_default('assembly_path'))
    _cache['genome'] = genome
    return genome
def __init__(self, assembly, window_size, batch_size, buffer_size=None,
             max_gap_size=100, train_chromosomes=None, val_chromosomes=None,
             cache_dir=None, lazy_load=True, clear_cache=False,
             compile_on_start=True, n_type="uniform"):
    self.assembly, self.window_size = assembly, window_size
    self.max_gap_size, self.batch_size, self.val_chromosomes = max_gap_size, batch_size, val_chromosomes
    # A buffer_size of None defaults to the CPU count for optimal performance.
    if not buffer_size:
        buffer_size = cpu_count()
    self.buffer_size = buffer_size
    # Validate the type of N.
    if n_type not in self.n_types:
        raise ValueError(
            "n_type must be one of %s, got %s" % (self.n_types, n_type))
    self.n_type = n_type
    # Get the cache dir.
    cache_dir = cache_dir or os.environ.get("CACHE_PATH", None) or "/tmp"
    self._cache_directory = "/".join(
        [cache_dir, assembly, str(window_size)])
    if clear_cache:
        self.clean_cache()
    # Generate a pool of processes to save on per-task overhead.
    self.workers = max(2, cpu_count())
    self.pool = Pool(self.workers)
    # Preprocess all the possible data.
    self.genome = Genome(
        assembly=assembly,
        lazy_load=lazy_load,
        cache_directory=cache_dir,
    )
    if not val_chromosomes:
        self.val_chromosomes = []
    # If no chromosomes were passed, use the whole genome.
    if not train_chromosomes:
        self.chromosomes = sorted(list(self.genome))
    else:
        self.chromosomes = train_chromosomes + self.val_chromosomes
    self.instance_hash = sha256({
        "assembly": self.assembly,
        "chromosomes": self.chromosomes,
        "window_size": self.window_size,
        "max_gap_size": self.max_gap_size,
        "n_type": n_type,
    })
    if compile_on_start:
        self.compile()
def get_genome(assembly):
    return Genome(assembly)
def test_empty_genome():
    with pytest.raises(ValueError):
        Genome("hg19", filters=("", ))
def test_unavailable_genome():
    with pytest.raises(ValueError):
        Genome("hg1")
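# Hedged end-to-end sketch (not one of the original tests) of the core
# Genome API exercised above: download a small assembly, query chromosome
# membership, extract sequences from tessellated windows, then remove the
# local cache.
def genome_api_sketch():
    genome = Genome("sacCer3", chromosomes=["chrI"])
    assert "chrI" in genome
    windows = tessellate_bed(
        genome.filled(chromosomes=["chrI"]),
        window_size=200
    )
    sequences = genome.bed_to_sequence(windows)
    assert len(sequences) == len(windows)
    genome.delete()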
def get_holdouts(batch_size: int = 128,
                 max_wiggle_size: int = 150,
                 wiggles: int = 10,
                 random_state: int = 42,
                 window_size: int = 500,
                 test_size: float = 0.3,
                 verbose: bool = True,
                 nrows: int = None):
    """Return generator with training and testing holdouts.

    Parameters
    ---------------------------
    batch_size: int = 128,
        The batch size to use. Since the task is significantly unbalanced,
        consider using high batch sizes.
    max_wiggle_size: int = 150,
        Maximum amount by which to wiggle the windows.
    wiggles: int = 10,
        Number of wiggles per sample.
    random_state: int = 42,
        Random state to use for reproducibility.
    window_size: int = 500,
        Window size to use.
    test_size: float = 0.3,
        Percentage to leave for the test set.
    verbose: bool = True,
        Whether to show the loading bar.
    nrows: int = None,
        Number of rows to read. Useful to test the pipeline.

    Raises
    ----------------------------
    ValueError,
        If the given window size is less than or equal to twice the given
        maximum wiggle size.

    Returns
    ----------------------------
    Generator with the training holdouts.
    """
    if window_size <= max_wiggle_size * 2:
        raise ValueError(
            ("Given window size {} is less than or equal to twice the "
             "given max_wiggle_size {}. This may lead the central SNV "
             "to fall outside the region, hence causing a false positive. "
             "Please either increase the window size or reduce the "
             "maximum wiggle size.").format(window_size, max_wiggle_size))
    # Load the bed file.
    bed = pd.read_csv(os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "mendelian_snv.csv.gz"),
                      nrows=nrows)
    # Expand (or compress) the given bed file windows to the required size.
    bed = expand_bed_regions(bed, window_size)
    # Load the genomic assembly.
    assembly = Genome("hg19", verbose=False)
    # Retrieve the set of unique folds.
    unique_folds = bed.folds.unique()
    # For each holdout:
    for fold in tqdm(
            unique_folds,
            desc="Holdouts",
            disable=not verbose,
    ):
        # Compute the folds mask.
        folds_mask = (bed.folds != fold).values
        # The training bed partition contains all the folds that do not go
        # into the test partition.
        train_bed = bed.iloc[folds_mask]
        # The testing bed partition keeps only the single fold that this
        # iteration of the 10-fold CV has left out of the train.
        test_bed = bed.iloc[~folds_mask]
        # We wiggle the bed regions by the desired amount to generate the
        # required number of wiggles. We wiggle only the training positives,
        # as wiggling the training negatives might create false negatives.
        positives = train_bed[(train_bed.labels == 1).values]
        # If wiggles are requested:
        if wiggles > 0:
            # Compute the wiggles.
            wiggled_train_bed = wiggle_bed_regions(positives,
                                                   max_wiggle_size,
                                                   wiggles,
                                                   random_state)
            # Concatenate the wiggles with the training data.
            train_bed = pd.concat([wiggled_train_bed, train_bed])
        # Shuffle the training data.
        # INFO: This shuffle should not be needed, but just for peace of mind.
        train_bed = train_bed.sample(frac=1, random_state=random_state + fold)
        # Shuffle the test data.
        # INFO: This shuffle should not be needed, but just for peace of mind.
        test_bed = test_bed.sample(frac=1, random_state=random_state + fold)
        # Return the computed training and testing sequences.
        yield (create_sequence(train_bed, assembly, batch_size),
               create_sequence(test_bed, assembly, batch_size))
def visualize(cell_line, epigenomes, labels):
    genome = Genome("hg19")
    sequences = {
        region: to_dataframe(flat_one_hot_encode(genome, data, 200), 200)
        for region, data in epigenomes.items()
    }
    tasks = {
        "x": [
            *[val.values for val in epigenomes.values()],
            *[val.values for val in sequences.values()]
        ],
        "y": [
            *[val.values.ravel() for val in labels.values()],
            *[val.values.ravel() for val in labels.values()]
        ],
        "titles": [
            "Epigenomes promoters",
            "Epigenomes enhancers",
            "Sequences promoters",
            "Sequences enhancers"
        ]
    }
    xs = tasks["x"]
    ys = tasks["y"]
    titles = tasks["titles"]
    assert len(xs) == len(ys) == len(titles)
    for x, y in zip(xs, ys):
        assert x.shape[0] == y.shape[0]
    colors = np.array([
        "tab:blue",
        "tab:orange",
    ])
    fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(32, 8))
    for x, y, title, axis in tqdm(zip(xs, ys, titles, axes.flatten()),
                                  desc="Computing PCAs",
                                  total=len(xs)):
        axis.scatter(*pca(x).T, s=1, color=colors[y])
        axis.xaxis.set_visible(False)
        axis.yaxis.set_visible(False)
        axis.set_title(f"PCA decomposition - {title}")
    plt.savefig("./imgs/" + cell_line + "/PCA decomposition")
    plt.show()
    for perplexity in tqdm((50, 500), desc="Running perplexities"):
        fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(40, 10))
        for x, y, title, axis in tqdm(zip(xs, ys, titles, axes.flatten()),
                                      desc="Computing TSNEs",
                                      total=len(xs)):
            axis.scatter(*ulyanov_tsne(x, perplexity=perplexity).T,
                         s=1, color=colors[y])
            axis.xaxis.set_visible(False)
            axis.yaxis.set_visible(False)
            axis.set_title(f"TSNE decomposition - {title}")
        fig.tight_layout()
        fig.savefig("./imgs/" + cell_line + "/TSNE_" + str(perplexity))
        plt.show()
def train_model_seq(models, epigenomes, nlabels, region_type, cell_line):
    # Reproducibility.
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(42)
    splits = 11
    holdouts = StratifiedShuffleSplit(
        n_splits=splits, test_size=0.2, random_state=42)
    genome = Genome("hg19")
    bed = to_bed(epigenomes[region_type])
    labels = nlabels[region_type].values.ravel()
    if os.path.exists(cell_line + "_" + region_type + "_sequence.json"):
        results = compress_json.local_load(
            cell_line + "_" + region_type + "_sequence.json")
    else:
        results = []
    class_w = class_weight.compute_class_weight(
        'balanced', classes=np.unique(labels), y=labels)
    class_w = dict(enumerate(class_w))
    print("Class weights: " + str(class_w))
    for i, (train_index, test_index) in tqdm(enumerate(holdouts.split(bed, labels)),
                                             total=splits,
                                             desc="Computing holdouts",
                                             dynamic_ncols=True):
        train, test = get_holdout(
            train_index, test_index, bed, labels, genome, 1024)
        print("=" * 80)
        for model in tqdm(models,
                          total=len(models),
                          desc="Training models",
                          leave=False,
                          dynamic_ncols=True):
            # Skip models whose scores were already computed for this holdout.
            if precomputed(results, model.name, i):
                continue
            history = model.fit(
                train,
                steps_per_epoch=train.steps_per_epoch,
                validation_data=test,
                validation_steps=test.steps_per_epoch,
                epochs=1000,
                shuffle=True,
                verbose=False,
                class_weight=class_w,
                callbacks=[
                    EarlyStopping(monitor="val_loss",
                                  mode="min",
                                  patience=50,
                                  restore_best_weights=True),
                ]
            ).history
            scores = pd.DataFrame(history).iloc[-1].to_dict()
            results.append({
                "model": model.name,
                "run_type": "train",
                "holdout": i,
                **{
                    key: value
                    for key, value in scores.items()
                    if not key.startswith("val_")
                }
            })
            results.append({
                "model": model.name,
                "run_type": "test",
                "holdout": i,
                **{
                    key[4:]: value
                    for key, value in scores.items()
                    if key.startswith("val_")
                }
            })
            compress_json.local_dump(
                results, cell_line + "_" + region_type + "_sequence.json")
    df = pd.DataFrame(results).drop(columns="holdout")
    return df
def preprocess_mode_exec(c):
    logging.basicConfig(format='[%(asctime)s] - %(levelname)s - %(message)s',
                        level=logging.DEBUG)
    logging.debug("PREPROCESSING MODE")
    root_path = c['import_path']
    saving_path = c['export_path']
    cell_lines = c['cell_lines']
    window_size = c['window_size']
    dataset_type = c['dataset']
    if not os.path.exists(root_path):
        raise FileNotFoundError("Files path not found: {}".format(root_path))
    if not os.path.exists(saving_path):
        logging.debug("{} not found, folder will be created".format(saving_path))
        os.makedirs(saving_path)
    label_epi_path = get_full_path(root_path, window_size, dataset_type)
    # Importing the regions for enhancers and promoters.
    enhancers_regions, promoters_regions = get_regions(root_path)
    # Importing and converting the labels of enhancers and promoters, and
    # joining them into a single dataframe.
    full_sequences = get_categorical_labels(label_epi_path)
    logging.debug("Saving the sequences bed file in {}".format(saving_path))
    rows = 0
    if c['sample']:
        sample_size = int(len(full_sequences) * c['sample_perc'])
        rows = np.random.randint(len(full_sequences), size=sample_size)
        full_sequences = full_sequences.iloc[rows]
    full_sequences.to_csv("{}/sequences.bed".format(saving_path),
                          sep="\t",
                          columns=['chrom', 'chromStart', 'chromEnd'],
                          header=False,
                          index=False)
    logging.debug("Downloading the hg19 genome")
    chroms = [k for k, _ in full_sequences.groupby(['chrom'])]
    hg19 = Genome(assembly="hg19", chromosomes=chroms)
    logging.debug("Converting the bed regions to sequences")
    sequences = hg19.bed_to_sequence(full_sequences)
    logging.debug("Saving sequences to file...")
    seqIO_seq = [
        creating_seqIO(
            "{}:{}-{}".format(row['chrom'], row['chromStart'], row['chromEnd']),
            Seq(row['sequence'].upper()))
        for _, row in sequences.iterrows()
    ]
    save_sequences(saving_path, seqIO_seq)
    # Importing the epigenetic data.
    logging.debug("Importing epigenetic data for: {}".format(
        ", ".join(cell_lines)))
    logging.debug(
        "-------------------------------------------------------------")
    for l in cell_lines:
        logging.debug("Importing {} data".format(l))
        df_epi_enhancers, df_epi_promoters = get_epigenetic_data(
            label_epi_path, l)
        # Building the type dictionary.
        converting_dictionary = {
            column: get_type(column)
            for column in df_epi_promoters.columns
        }
        df_epi_enhancers = df_epi_enhancers.astype(converting_dictionary)
        df_epi_promoters = df_epi_promoters.astype(converting_dictionary)
        assert len(df_epi_promoters.columns) == len(df_epi_enhancers.columns)
        logging.debug("Number of features for {}: {}".format(
            l, len(df_epi_promoters.columns) - 4))
        logging.debug("Number of missing values in enhancers: {}".format(
            df_epi_enhancers.isna().sum().sum()))
        logging.debug("Number of missing values in promoters: {}".format(
            df_epi_promoters.isna().sum().sum()))
        df_epi_enhancers = fill_missing(df_epi_enhancers, metric="median")
        df_epi_promoters = fill_missing(df_epi_promoters, metric="median")
        assert len(enhancers_regions) == len(df_epi_enhancers)
        logging.debug("Enhancers - regions: {}, epigenetics: {}".format(
            len(enhancers_regions), len(df_epi_enhancers)))
        assert len(promoters_regions) == len(df_epi_promoters)
        logging.debug("Promoters - regions: {}, epigenetics: {}".format(
            len(promoters_regions), len(df_epi_promoters)))
        full_epi = append_without_duplicates(df_epi_enhancers, df_epi_promoters)
        if c['sample']:
            full_epi = full_epi.iloc[rows]
        # Check that the sequences and epigenetic dataframes are aligned
        # before saving.
        assert len(full_sequences) == len(full_epi)
        assert_frame_equal(full_sequences[['chrom', 'chromStart', 'chromEnd']],
                           full_epi[['chrom', 'chromStart', 'chromEnd']])
        logging.debug("Number of total sequences: {}".format(
            len(full_sequences)))
        logging.debug("Saving results in {}".format(saving_path))
        np.savetxt("{}/{}_epigenetic.txt".format(saving_path, l),
                   full_epi.iloc[:, 4:].values,
                   fmt='%f')
        np.savetxt("{}/{}_labels.txt".format(saving_path, l),
                   full_sequences[l].values,
                   fmt='%s')
        logging.debug(
            "-------------------------------------------------------------")
#!/usr/bin/env python
# coding: utf-8
import os
import argparse

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn
import scipy.stats
import pysam
from tqdm import tqdm
from ucsc_genomes_downloader import Genome

tqdm.pandas()

hg38 = Genome(assembly="hg38")

parser = argparse.ArgumentParser(
    description='Process histograms, scatter plots and metaplots')
parser.add_argument('cell_type')
parser.add_argument('tabix_file')
parser.add_argument('fragments')
args = parser.parse_args()

cell_type = args.cell_type
tabix_file = pysam.TabixFile(args.tabix_file)

os.system(
    'gunzip -c {} | bedtools intersect -sorted -c -a /home/John/JohnProject/reference/DHS_adjusted_6mer_bias_adjustedby_30_sorted_no_blacklist.unique.bed -b - > {}/index_cuts_{}_intersect.bed'