def test_genome__init__(genome="tests/data/small_genome.fa.gz"):
    """Check Genome construction failures and the attributes of a loaded genome."""
    # a genome directory without a fasta file is rejected
    with pytest.raises(FileNotFoundError):
        genomepy.Genome("empty", "tests/data/genome")

    # a missing genome directory is rejected as well
    with pytest.raises(FileNotFoundError):
        genomepy.Genome("unknown", "unknown")

    # start without a README next to the test genome
    readme_path = "tests/data/README.txt"
    if os.path.exists(readme_path):
        os.unlink(readme_path)

    g = genomepy.Genome(genome)

    # naming and locations
    assert g.genomes_dir == genomepy.utils.get_genomes_dir(None, False)
    assert g.name == "small_genome"
    assert g.filename == os.path.abspath(genome)
    assert g.genome_dir == os.path.dirname(g.filename)

    # support files are present on disk
    assert os.path.exists(g.index_file)
    assert os.path.exists(g.sizes_file)
    assert os.path.exists(g.gaps_file)

    # contents of the support files
    assert isinstance(g.sizes, dict)
    assert isinstance(g.gaps, dict)

    # no annotation files for this genome
    assert g.annotation_gtf_file is None
    assert g.annotation_bed_file is None

    # metadata and plugins
    assert g.tax_id == g.assembly_accession == "na"
    assert isinstance(g.plugin, dict)
def test__bed_to_seqs(genome="tests/data/small_genome.fa.gz", track="tests/data/regions.bed"):
    """Check the names and sequences produced by Genome._bed_to_seqs."""
    g = genomepy.Genome(genome)
    names = ["chrI:10-20 gene_a", "chrII:20-30 gene_b"]

    def check(records, sequences):
        # index-based lookup: any record beyond the expected ones raises
        for idx, record in enumerate(records):
            assert record.name == names[idx]
            assert record.seq == sequences[idx]

    # sequences marked in the BED track, strand ignored
    check(
        g._bed_to_seqs(track=track, stranded=False, extend_up=0, extend_down=0),
        ["CCCACACACC", "TCCTCCAAGC"],
    )

    # stranded: the second region lies on the negative strand
    # (unstranded result: "CCCACACACC", "TCCTCCAAGC")
    check(
        g._bed_to_seqs(track=track, stranded=True, extend_up=0, extend_down=0),
        ["CCCACACACC", "GCTTGGAGGA"],
    )

    # stranded and extended by different amounts on either side
    # (unextended result: "CCCACACACC", "GCTTGGAGGA")
    check(
        g._bed_to_seqs(track=track, stranded=True, extend_up=1, extend_down=2),
        ["ACCCACACACCCA", "GGCTTGGAGGAGA"],
    )
def test__update_metadata(genome="tests/data/small_genome.fa.gz"):
    """Check that Genome._update_metadata adds tax_id and assembly_accession in place."""
    meta = {"provider": "NCBI", "original name": "ASM14646v1"}
    genomepy.Genome(genome)._update_metadata(meta)

    expected = {
        "tax_id": "58839",
        "assembly_accession": "GCA_000146465.1",
    }
    for key, value in expected.items():
        assert meta[key] == value
def get_chromsizes_from_genomepy(
    genome,
    saveas=None,
):
    """
    Get chrom size info for *genome* from genomepy, if genomepy has been
    imported.

    Parameters
    ----------
    genome : str
        Name of the genome assembly (e.g., "hg38")

    saveas : str
        Filename to save output to. Dictionary will still be returned.

    Returns
    -------
    dict or None
        Maps each chromosome name to a ``(0, size)`` tuple. ``None`` is
        returned when genomepy is not in ``sys.modules``, when the genome
        object has no ``sizes_file`` attribute, or when a
        ``FileNotFoundError`` is raised while loading or reading the genome.
    """
    if "genomepy" not in sys.modules:
        return None

    d = {}

    try:
        g = genomepy.Genome(genome)
        # Fail silently if the sizes file cannot be accessed
        if not hasattr(g, "sizes_file"):
            return None
        # The context manager closes the handle even if a line fails to parse.
        with open(g.sizes_file) as sizes:
            for line in sizes:
                # Skip blank lines instead of failing on the unpack below.
                if not line.strip():
                    continue
                chrom, size = line.split()
                d[chrom] = (0, int(size))

        if saveas is not None:
            chromsizes_to_file(d, saveas)
    except FileNotFoundError:
        return None

    return d
def get_ref_fasta(genome):
    """Return the path to the reference fasta for *genome*.

    When ``is_az()`` is true and the fasta exists under
    ``/ngs/reference_data``, that path is returned. Otherwise the genome is
    looked up in a ``genomes`` directory relative to ``DATA_DIR`` and
    installed there through genomepy (from UCSC) if it is not installed yet.

    Raises
    ------
    IndexError
        If genomepy does not list *genome* among the available genomes.
    """
    if is_az():
        path = '/ngs/reference_data/genomes/Hsapiens/' + genome + '/seq/' + genome + '.fa'
        if isfile(path):
            logger.info('Found genome fasta at ' + path)
            return path

    if isdir(join(DATA_DIR, 'genomes', genome)):
        genome_dir = safe_mkdir(join(DATA_DIR, 'genomes'))
    else:
        genome_dir = safe_mkdir(join(DATA_DIR, '..', 'genomes'))

    if genome not in genomepy.list_installed_genomes(genome_dir):
        # The install below always uses this provider, so the log names it too.
        provider = 'UCSC'
        is_available = any(rec[1] == genome for rec in genomepy.list_available_genomes())
        if not is_available:
            message = 'Error: genome ' + genome + ' is not available'
            logger.critical(message)
            # Stop explicitly rather than attempting the install anyway.
            # IndexError is kept because the previous code failed with it
            # here when indexing the empty list of matching records.
            raise IndexError(message)

        logger.info('Downloading genome ' + genome + ' from ' + provider +
                    ' and installing into ' + genome_dir)
        genomepy.install_genome(genome, provider, genome_dir=genome_dir)

    genome_fasta_file = genomepy.Genome(genome, genome_dir=genome_dir).filename
    return genome_fasta_file
def test_install_genome():
    """Install a local genome under a custom name and check the files it produces."""
    localname = "my_genome"
    genomepy.functions.install_genome(
        name="tests/data/sacCer3/sacCer3.fa",
        provider="Local",
        genomes_dir=None,
        localname=localname,
        regex="chrIV",
        annotation=True,
        force=True,
    )

    install_dir = os.path.join(
        genomepy.functions.get_genomes_dir(None, False), localname
    )

    # genome, sizes, gaps and annotation files, in that order
    for suffix in (".fa", ".fa.sizes", ".gaps.bed", ".annotation.gtf"):
        assert os.path.exists(os.path.join(install_dir, localname + suffix))

    # regex test: the chromosome matching the regex is present
    assert "chrIV" in genomepy.Genome(localname).sizes.keys()
def combine_peaks(peaks, genome, window, scale_value):
    """
    Combine multiple MACS2 summit files and returns the summit with the maximum
    value.

    Parameters
    ----------
    peaks : list
        List with summit file names.

    genome : str
        Genome file name. Either a file with chromosome sizes or a genomepy
        genome name.

    window : int
        Window size. Summits will be extended to this size before merging.

    scale_value : bool
        Scale summit values before taking the maximum.

    Returns
    -------
    summits : pandas.DataFrame
        DataFrame with summits (columns ``chrom``, ``start``, ``end``).
    """
    import os

    try:
        g = genomepy.Genome(genome)
        genome = g.sizes_file
    except Exception:
        # Deliberate best effort: if genomepy cannot load it, `genome` is
        # passed on unchanged and treated as a chromosome sizes file.
        pass

    dfs = [read_peak_file_to_df(fname) for fname in peaks]
    df_all = pd.concat(dfs)

    check_col = "log_value_scaled" if scale_value else "log_value"

    # store summit location + associated value in col4
    df_all["col4"] = (
        df_all["chrom"].astype(str)
        + ";"
        + df_all["start"].astype(str)
        + ";"
        + df_all["end"].astype(str)
        + ";"
        + df_all[check_col].astype(str)
    )

    # Only the name is needed; close the handle right away so the file can be
    # rewritten by to_csv.
    with NamedTemporaryFile(suffix=".all_peaks.bed", delete=False) as handle:
        tmp = handle.name

    try:
        out = df_all[["chrom", "start", "end", "col4"]].sort_values(["chrom", "start"])
        out.to_csv(tmp, sep="\t", index=False, header=False)

        b = BedTool(tmp)
        all_summits = []
        # loop over merged peaks based on window size and collapse on col4 (summit + value)
        for f in b.slop(b=window // 2, g=genome).merge(c=4, o="collapse"):
            summits = [x.split(";") for x in f[3].split(",")]
            # only keep the highest summit (the last one wins on ties)
            all_summits.append(sorted(summits, key=lambda x: float(x[3]))[-1][:3])
    finally:
        # The file was created with delete=False, so remove it explicitly.
        os.unlink(tmp)

    df = pd.DataFrame(all_summits, columns=["chrom", "start", "end"])
    return df
def test_ucsc_genome(genome="sacCer3", provider="UCSC"):
    """Test UCSC.

    Download S. cerevisiae genome from UCSC and retrieve a
    specific sequence."""
    import shutil

    tmp = mkdtemp()
    try:
        genomepy.install_genome(genome, provider, genome_dir=tmp)
        g = genomepy.Genome(genome, genome_dir=tmp)
        seq = g["chrIV"][1337000:1337020]
        assert str(seq) == "TTTGGTTGTTCCTCTTCCTT"
    finally:
        # mkdtemp leaves removal to the caller; clean up on failure too.
        shutil.rmtree(tmp)
def test_gaps(genome="tests/data/gap.fa"):
    """Check the gaps attribute: initial content, user override and repopulation."""
    g = genomepy.Genome(genome)
    chroms_with_gaps = ["chr1", "chr3"]

    assert list(g.gaps.keys()) == chroms_with_gaps

    # a user-supplied value is kept as is
    g.gaps = {"asd": 1}
    assert g.gaps == {"asd": 1}

    # an empty dict is filled again on access
    g.gaps = {}
    assert list(g.gaps.keys()) == chroms_with_gaps
def test__region_to_seq(genome="tests/data/small_genome.fa.gz", region="chrI:10-20"):
    """Check Genome._region_to_seq with and without extension of the region."""
    g = genomepy.Genome(genome)

    # the region exactly as given
    plain = g._region_to_seq(region=region, extend_up=0, extend_down=0)
    assert plain == "CCCACACACC"

    # one extra base upstream, two downstream (unextended: "CCCACACACC")
    extended = g._region_to_seq(region=region, extend_up=1, extend_down=2)
    assert extended == "ACCCACACACCCA"
def test_ensembl_human():
    """Test Ensembl.

    Download human genome from Ensembl and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome("GRCh38.p10", "Ensembl", genome_dir=tmp)
        g = genomepy.Genome("GRCh38.p10", genome_dir=tmp)
        seq = g["6"][166168664:166168679]
        assert str(seq) == "CCTCCTCGCTCTCTT"
    finally:
        # Remove the download even when the install or the assertion fails.
        shutil.rmtree(tmp)
def test_ensembl_genome():
    """Test Ensembl.

    Download Drosophila genome from Ensembl and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome("BDGP6", "Ensembl", genome_dir=tmp)
        g = genomepy.Genome("BDGP6", genome_dir=tmp)
        seq = g["3L"][10637840:10637875]
        # compared case-insensitively
        assert str(seq).upper() == "TTTGCAACAGCTGCCGCAGTGTGACCGTTGTACTG"
    finally:
        # Remove the download even when the install or the assertion fails.
        shutil.rmtree(tmp)
def test_ncbi_genome(genome="ASM2732v1", provider="NCBI"):
    """Test NCBI.

    Download smallest genome from NCBI and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome(genome, provider, genome_dir=tmp)
        g = genomepy.Genome(genome, genome_dir=tmp)
        seq = g["ANONYMOUS"][80:107]
        # compared case-insensitively
        assert str(seq).upper() == "ATACCTTCCTTAATACTGTTAAATTAT"
    finally:
        # Remove the download even when the install or the assertion fails.
        shutil.rmtree(tmp)
def test_ucsc_human():
    """Test UCSC.

    Download human genome from UCSC and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome("hg38", "UCSC", genome_dir=tmp)
        g = genomepy.Genome("hg38", genome_dir=tmp)
        seq = g["chr6"][166168664:166168679]
        assert str(seq) == "CCTCCTCGCTCTCTT"
    finally:
        # Remove the download even when the install or the assertion fails.
        shutil.rmtree(tmp)
# NOTE(review): a function named test_ncbi_genome is also defined earlier in
# this file; if both live in the same module, this definition replaces it and
# the earlier test is never collected — confirm.
def test_ncbi_genome():
    """Test NCBI.

    Download Drosophila genome from NCBI and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome("Release 6 plus ISO1 MT", "NCBI", genome_dir=tmp)
        # installed with spaces in the name, loaded with underscores
        g = genomepy.Genome("Release_6_plus_ISO1_MT", genome_dir=tmp)
        seq = g["3L"][10637840:10637875]
        # compared case-insensitively
        assert str(seq).upper() == "TTTGCAACAGCTGCCGCAGTGTGACCGTTGTACTG"
    finally:
        # Remove the download even when the install or the assertion fails.
        shutil.rmtree(tmp)
def test_ncbi_human():
    """Test NCBI.

    Download human genome from NCBI and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome("GRCh38.p9", "NCBI", genome_dir=tmp)
        g = genomepy.Genome("GRCh38.p9", genome_dir=tmp)
        seq = g["6"][166168664:166168679]
        assert str(seq) == "CCTCCTCGCTCTCTT"
    finally:
        # Remove the download even when the install or the assertion fails.
        shutil.rmtree(tmp)
# NOTE(review): a function named test_ensembl_genome is also defined earlier in
# this file; if both live in the same module, this definition replaces it and
# the earlier test is never collected — confirm.
def test_ensembl_genome(genome="KH", provider="Ensembl", version=98):
    """Test Ensembl.

    Download smallest genome from Ensembl's HTTPS and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        # Only test on vertebrates as these are downloaded over HTTPS.
        # All others are downloaded over FTP, which is unreliable on Travis.
        genomepy.install_genome(genome, provider, genome_dir=tmp, version=version)
        g = genomepy.Genome(genome, genome_dir=tmp)
        seq = g["1"][40:60]
        # compared case-insensitively
        assert str(seq).upper() == "nnnnnnnnnnAACCCCTAAC".upper()
    finally:
        # Remove the download even when the install or the assertion fails.
        shutil.rmtree(tmp)
def test_sizes(genome="tests/data/gap.fa"):
    """Check the sizes attribute: initial content, user override and repopulation."""
    g = genomepy.Genome(genome)
    all_chroms = ["chr1", "chr2", "chr3"]

    assert list(g.sizes.keys()) == all_chroms
    assert all(isinstance(g.sizes[name], int) for name in g.sizes.keys())
    assert g.sizes["chr1"] == 28

    # a user-supplied value is kept as is
    g.sizes = {"asd": 1}
    assert g.sizes == {"asd": 1}

    # an empty dict is filled again on access
    g.sizes = {}
    assert list(g.sizes.keys()) == all_chroms
def test_check_annotation_file(genome="tests/data/small_genome.fa.gz"):
    """Check Genome.check_annotation_file for a missing and an existing file."""
    g = genomepy.Genome(genome)

    # does not exist
    gtf = g.check_annotation_file("gtf")
    assert gtf is None

    # does exist
    path = "tests/data/small_genome.annotation.test.gz"
    with open(path, "w") as fa:
        fa.write("test")
    try:
        test = g.check_annotation_file("test")
        assert test == os.path.abspath(path)
    finally:
        # Remove the scratch file even when the assertion fails.
        os.unlink(path)
def test__parse_name(genome="tests/data/small_genome.fa.gz"):
    """Check Genome._parse_name for a plain name, a file path and a URL."""
    g = genomepy.Genome(genome)  # which genome is loaded does not matter here

    cases = [
        # plain name
        ("test", "test"),
        # file path
        ("/home/genomepy/genomes/test2.fa", "test2"),
        # url
        ("http://ftp.xenbase.org/pub/Genomics/JGI/Xentr9.1/XT9_1.fa.gz", "XT9_1"),
    ]
    for raw, expected in cases:
        assert g._parse_name(raw) == expected
def test__update_assembly_accession(genome="tests/data/small_genome.fa.gz"):
    """Check Genome._update_assembly_accession without and with a provider genome."""
    g = genomepy.Genome(genome)

    # genome not found: only the metadata dict is passed
    without_match = {}
    g._update_assembly_accession(without_match)
    assert without_match["assembly_accession"] == "na"

    # genome found: provider and its genome record are passed along
    with_match = {}
    ncbi = ProviderBase.create("NCBI")
    ncbi_record = ncbi.genomes.get("ASM14646v1")
    g._update_assembly_accession(with_match, ncbi, ncbi_record)
    assert with_match["assembly_accession"] == "GCA_000146465.1"
def test__update_tax_id(genome="tests/data/small_genome.fa.gz"):
    """Check Genome._update_tax_id without and with a provider genome."""
    g = genomepy.Genome(genome)

    # genome not found: only the metadata dict is passed
    without_match = {}
    g._update_tax_id(without_match)
    assert without_match["tax_id"] == "na"

    # genome found: provider and its genome record are passed along
    with_match = {}
    ncbi = ProviderBase.create("NCBI")
    ncbi_record = ncbi.genomes.get("ASM14646v1")
    g._update_tax_id(with_match, ncbi, ncbi_record)
    assert with_match["tax_id"] == "58839"
def test__update_provider(genome="tests/data/small_genome.fa.gz"):
    """Check Genome._update_provider with no URL and with an NCBI genome URL."""
    g = genomepy.Genome(genome)

    # no url to parse
    empty_meta = {}
    g._update_provider(empty_meta)
    assert empty_meta.get("provider") == "Unknown"

    # a url that can be parsed
    ncbi_url = (
        "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/465/"
        "GCF_000146465.1_ASM14646v1/GCF_000146465.1_ASM14646v1_genomic.fna.gz"
    )
    url_meta = {"genome url": ncbi_url}
    g._update_provider(url_meta)
    assert url_meta.get("provider") == "NCBI"
def test_url_genome():
    """Test URL.

    Download a genome directly from a UCSC URL (the ce11 chromFa archive)
    and retrieve a specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome(
            "http://hgdownload.soe.ucsc.edu/goldenPath/ce11/bigZips/chromFa.tar.gz",
            "url",
            genome_dir=tmp,
            localname="url_test",
        )
        g = genomepy.Genome("url_test", genome_dir=tmp)
        # compared case-insensitively
        assert str(g["chrI"][:12]).lower() == "gcctaagcctaa"
    finally:
        # Remove the download even when the install or the assertion fails.
        shutil.rmtree(tmp)
def test__regions_to_seqs(genome="tests/data/small_genome.fa.gz", track="tests/data/regions.txt"):
    """Check the names and sequences produced by Genome._regions_to_seqs."""
    g = genomepy.Genome(genome)
    names = ["chrI:10-20", "chrII:20-30"]

    def check(records, sequences):
        # index-based lookup: any record beyond the expected ones raises
        for idx, record in enumerate(records):
            assert record.name == names[idx]
            assert record.seq == sequences[idx]

    # sequences for the regions listed in the track file
    check(
        g._regions_to_seqs(track=track, extend_up=0, extend_down=0),
        ["CCCACACACC", "TCCTCCAAGC"],
    )

    # extended by different amounts on either side
    # (unextended result: "CCCACACACC", "TCCTCCAAGC")
    check(
        g._regions_to_seqs(track=track, extend_up=1, extend_down=2),
        ["ACCCACACACCCA", "CTCCTCCAAGCCC"],
    )
def test_get_random_sequences(genome="tests/data/small_genome.fa.gz"):
    """Check the shape of Genome.get_random_sequences output and its N content."""
    g = genomepy.Genome(genome)
    n = 2
    length = 200  # default
    chroms = ["chrI", "chrII"]
    max_n = 0.1  # default

    regions = g.get_random_sequences(n=n, length=length, chroms=chroms, max_n=max_n)

    # right number of entries, each a (chrom, start, end) of the requested length
    assert len(regions) == n
    for idx in range(n):
        entry = regions[idx]
        assert entry[0] in chroms
        assert (
            isinstance(entry[0], str)
            and isinstance(entry[1], int)
            and isinstance(entry[2], int)
        )
        assert entry[2] - entry[1] == length

    # the number of Ns stays at or below the cutoff
    regions = g.get_random_sequences(n=1, chroms=chroms, outtype="string")
    sequence = str(g.track2fasta(regions[0])[0].seq)
    assert sequence.upper().count("N") <= length * max_n
def test__read_metadata(genome="tests/data/small_genome.fa.gz"):
    """Check Genome._read_metadata against several states of the README file."""
    g = genomepy.Genome(genome)
    readme = g.readme_file
    if os.path.exists(readme):
        os.unlink(readme)

    try:
        # no readme found
        metadata = g._read_metadata()
        assert metadata["provider"] == "na"

        # no overwrites to metadata
        with open(readme, "w") as f:
            f.write("provider: not_really_NCBI\n")
            f.write("tax_id: not_really_58839\n")
            f.write("assembly_accession: not_really_GCA_000146465.1\n")
        metadata = g._read_metadata()
        assert metadata["provider"] == "not_really_NCBI"

        # updates to metadata dict and file
        with open(readme, "w") as f:
            f.write(
                "genome url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/"
                "146/465/GCF_000146465.1_ASM14646v1/"
                "GCF_000146465.1_ASM14646v1_genomic.fna.gz\n"
            )
            f.write("tax_id: not_really_58839\n")
            f.write("assembly_accession: not_really_GCA_000146465.1\n")
        metadata1 = g._read_metadata()
        assert metadata1["provider"] == "NCBI"
        metadata2, _ = genomepy.utils.read_readme(readme)
        assert metadata2["provider"] == "NCBI"

        # no writing permission to file
        with open(readme, "w") as f:
            f.write(
                "genome url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/"
                "146/465/GCF_000146465.1_ASM14646v1/"
                "GCF_000146465.1_ASM14646v1_genomic.fna.gz\n"
            )
        os.chmod(readme, S_IREAD | S_IRGRP | S_IROTH)
        metadata1 = g._read_metadata()
        assert metadata1["provider"] == "na"
    finally:
        # Never leave a (possibly read-only) README behind for other tests,
        # whichever step failed.
        if os.path.exists(readme):
            os.unlink(readme)
def test_track2fasta(genome="tests/data/small_genome.fa.gz"):
    """Check Genome.track2fasta on a region file and a BED file."""
    tracks = [
        ("tests/data/regions.txt", "interval"),
        ("tests/data/regions.bed", "bed"),
    ]
    # expected sequences per track; track i is extended by i upstream and
    # i + 1 downstream (unextended first sequence: CCCACACACC)
    expected = [
        ("CCCACACACCC", "TCCTCCAAGCC"),
        ("ACCCACACACCCA", "CTCCTCCAAGCCC"),
    ]

    g = genomepy.Genome(genome)
    for i, (track_file, _track_type) in enumerate(tracks):
        seq = g.track2fasta(
            track=track_file,
            fastafile=None,
            stranded=False,
            extend_up=i,
            extend_down=i + 1,
        )
        first, second = expected[i]
        assert seq[0].seq == first
        assert seq[1].seq == second
def test__parse_filename(genome="tests/data/small_genome.fa.gz"):
    """Check Genome._parse_filename for a file path, a folder, a name and a miss."""
    g = genomepy.Genome(genome)  # unimportant

    # file path
    filename = g._parse_filename(genome)
    assert filename == os.path.abspath(genome)

    # folder path
    filename = g._parse_filename(os.path.dirname(genome))
    assert filename == os.path.abspath(genome)

    # name of genome in genomes_dir
    os.mkdir("tests/data/small_genome")
    try:
        with open("tests/data/small_genome/small_genome.fa.gz", "w") as fa:
            fa.write("test")
        g.genomes_dir = "tests/data/"
        filename = g._parse_filename(os.path.basename(genome))
        assert filename == "tests/data/small_genome/small_genome.fa.gz"
    finally:
        # Remove the scratch directory even on failure; a leftover would make
        # os.mkdir fail on the next run.
        shutil.rmtree("tests/data/small_genome")

    # genome not found
    with pytest.raises(FileNotFoundError):
        g._parse_filename("does not exist")
def genome(request):
    """Build a fresh test genome directory and return a Genome loaded from it."""
    assembly = "ce10"  # Use fake name for blacklist test
    source_fasta = "tests/data/small_genome.fa.gz"

    # start from an empty genomes directory in the working directory
    genomes_dir = os.path.join(os.getcwd(), ".genomepy_plugin_tests")
    if os.path.exists(genomes_dir):
        genomepy.utils.rm_rf(genomes_dir)

    target_dir = os.path.join(genomes_dir, assembly)
    genomepy.utils.mkdir_p(target_dir)

    fasta_copy = os.path.join(target_dir, f"{assembly}.fa.gz")
    copyfile(source_fasta, fasta_copy)

    # unzip genome if required
    if request.param == "unzipped":
        sp.check_call(["gunzip", fasta_copy])

        # add annotation (for STAR and hisat2), but only once
        source_gtf = "tests/data/ce10.annotation.gtf.gz"
        gtf_copy = os.path.join(target_dir, f"{assembly}.annotation.gtf.gz")
        copyfile(source_gtf, gtf_copy)

    return genomepy.Genome(assembly, genomes_dir=genomes_dir)