Ejemplo n.º 1
0
def test_genome__init__(genome="tests/data/small_genome.fa.gz"):
    """Check ``Genome`` construction: error cases first, then attributes."""
    # a genome directory without a fasta file is rejected
    with pytest.raises(FileNotFoundError):
        genomepy.Genome("empty", "tests/data/genome")

    # a genome directory that does not exist is rejected
    with pytest.raises(FileNotFoundError):
        genomepy.Genome("unknown", "unknown")

    # make sure no README is present before the genome is loaded
    readme_path = "tests/data/README.txt"
    if os.path.exists(readme_path):
        os.unlink(readme_path)

    small = genomepy.Genome(genome)

    # names and locations
    assert small.genomes_dir == genomepy.utils.get_genomes_dir(None, False)
    assert small.name == "small_genome"
    assert small.filename == os.path.abspath(genome)
    assert small.genome_dir == os.path.dirname(small.filename)

    # support files must exist on disk
    for file_attr in ("index_file", "sizes_file", "gaps_file"):
        assert os.path.exists(getattr(small, file_attr))

    # sizes and gaps are exposed as dictionaries
    for dict_attr in ("sizes", "gaps"):
        assert isinstance(getattr(small, dict_attr), dict)

    # the test genome ships without annotation files
    for annotation_attr in ("annotation_gtf_file", "annotation_bed_file"):
        assert getattr(small, annotation_attr) is None

    assert small.tax_id == small.assembly_accession == "na"
    assert isinstance(small.plugin, dict)
Ejemplo n.º 2
0
def test__bed_to_seqs(genome="tests/data/small_genome.fa.gz",
                      track="tests/data/regions.bed"):
    """Check ``Genome._bed_to_seqs`` on a two-region BED track.

    Covers unstranded extraction, strand-aware extraction and extension of
    the regions. Each result is materialised and its length checked, so an
    empty or truncated result fails the test instead of skipping assertions.
    """
    g = genomepy.Genome(genome)
    expected_names = ["chrI:10-20 gene_a", "chrII:20-30 gene_b"]

    def check(expected_seqs, **kwargs):
        # list() so that the record count can be asserted explicitly
        seqs = list(g._bed_to_seqs(track=track, **kwargs))
        assert len(seqs) == len(expected_names)
        for seq, name, expected in zip(seqs, expected_names, expected_seqs):
            assert seq.name == name
            assert seq.seq == expected

    # extract sequences marked in regions.bed from small_genome.fa.gz
    check(["CCCACACACC", "TCCTCCAAGC"],
          stranded=False, extend_up=0, extend_down=0)

    # second sequence is on the negative strand
    # original: "CCCACACACC", "TCCTCCAAGC"
    check(["CCCACACACC", "GCTTGGAGGA"],
          stranded=True, extend_up=0, extend_down=0)

    # extend by varying amounts
    # original: "CCCACACACC", "GCTTGGAGGA"
    check(["ACCCACACACCCA", "GGCTTGGAGGAGA"],
          stranded=True, extend_up=1, extend_down=2)
Ejemplo n.º 3
0
def test__update_metadata(genome="tests/data/small_genome.fa.gz"):
    """``_update_metadata`` must add tax id and accession to the given dict."""
    small = genomepy.Genome(genome)
    meta = {"provider": "NCBI", "original name": "ASM14646v1"}

    small._update_metadata(meta)

    expected = (
        ("tax_id", "58839"),
        ("assembly_accession", "GCA_000146465.1"),
    )
    for key, value in expected:
        assert meta[key] == value
Ejemplo n.º 4
0
def get_chromsizes_from_genomepy(
    genome,
    saveas=None,
):
    """
    Get chrom size info for *genome* from genomepy, if genomepy is loaded.

    Parameters
    ----------

    genome : str
        Name of the genome assembly (e.g., "hg38")

    saveas : str
        Filename to save output to. Dictionary will still be returned.

    Returns
    -------

    dict or None
        Dictionary mapping each chromosome name to a ``(0, size)`` tuple.
        ``None`` is returned when ``genomepy`` is not present in
        ``sys.modules``, when the genome object has no ``sizes_file``
        attribute, or when a ``FileNotFoundError`` is raised while loading
        the genome, reading its sizes file or saving the result.
    """
    if "genomepy" not in sys.modules:
        return None

    d = {}
    try:
        g = genomepy.Genome(genome)
        # Fail silently if the sizes file cannot be accessed
        if not hasattr(g, "sizes_file"):
            return None
        # Context manager so the handle is closed even if parsing fails.
        with open(g.sizes_file) as sizes:
            for line in sizes:
                fields = line.split()
                if not fields:
                    # tolerate blank lines (e.g. a trailing newline)
                    continue
                chrom, size = fields
                d[chrom] = (0, int(size))

        if saveas is not None:
            chromsizes_to_file(d, saveas)
    except FileNotFoundError:
        return None

    return d
Ejemplo n.º 5
0
def get_ref_fasta(genome):
    """Return the path of the reference fasta for *genome*.

    When ``is_az()`` is true and the fasta exists under the fixed
    ``/ngs/reference_data`` location, that path is returned directly.
    Otherwise the genome is looked up in a genomepy directory derived from
    ``DATA_DIR`` and, if it is not installed there yet, installed from UCSC.

    Parameters
    ----------
    genome : str
        Genome name, used both in the fixed path and as the genomepy name.

    Returns
    -------
    The ``filename`` attribute of the ``genomepy.Genome`` object, or the
    fixed path when it was found.

    Raises
    ------
    LookupError
        If the genome is not listed by ``genomepy.list_available_genomes()``
        and ``logger.critical`` returned instead of aborting.
    """
    if is_az():
        path = '/ngs/reference_data/genomes/Hsapiens/' + genome + '/seq/' + genome + '.fa'
        if isfile(path):
            logger.info('Found genome fasta at ' + path)
            return path

    if isdir(join(DATA_DIR, 'genomes', genome)):
        genome_dir = safe_mkdir(join(DATA_DIR, 'genomes'))
    else:
        genome_dir = safe_mkdir(join(DATA_DIR, '..', 'genomes'))
    if genome not in genomepy.list_installed_genomes(genome_dir):
        # The install below always uses this provider, so the log message
        # reports it (the old message repeated the genome name instead).
        provider = 'UCSC'
        is_available = any(
            rec[1] == genome for rec in genomepy.list_available_genomes()
        )
        if not is_available:
            message = 'Error: genome ' + genome + ' is not available'
            logger.critical(message)
            # NOTE(review): logger.critical may already abort; this raise
            # covers loggers that only log, where the old code went on to
            # index an empty list.
            raise LookupError(message)
        logger.info('Downloading genome ' + genome + ' from ' + provider +
                    ' and installing into ' + genome_dir)
        genomepy.install_genome(genome, provider, genome_dir=genome_dir)
    genome_fasta_file = genomepy.Genome(genome, genome_dir=genome_dir).filename
    return genome_fasta_file
Ejemplo n.º 6
0
def test_install_genome():
    """Install a local fasta under a new name and check the produced files."""
    local_name = "my_genome"
    genomepy.functions.install_genome(
        name="tests/data/sacCer3/sacCer3.fa",
        provider="Local",
        genomes_dir=None,
        localname=local_name,
        regex="chrIV",
        annotation=True,
        force=True,
    )

    install_dir = os.path.join(
        genomepy.functions.get_genomes_dir(None, False), local_name)

    # genome, sizes, gaps and annotation files, in that order
    for suffix in (".fa", ".fa.sizes", ".gaps.bed", ".annotation.gtf"):
        assert os.path.exists(os.path.join(install_dir, local_name + suffix))

    # regex test:
    chrom_names = genomepy.Genome(local_name).sizes.keys()
    assert "chrIV" in chrom_names
Ejemplo n.º 7
0
def combine_peaks(peaks, genome, window, scale_value):
    """
    Combine multiple MACS2 summit files and returns the summit
    with the maximum value.

    Parameters
    ----------
    peaks : list
        List with summit file names.

    genome : str
        Genome file name. Either a file with chromosome sizes or a genomepy
        genome name.

    window : int
        Window size. Summits will be extended to this size before merging.

    scale_value : bool
        Scale summit values before taking the maximum.

    Returns
    -------
    summits : pandas.DataFrame
        DataFrame with summits.

    Notes
    -----
    The intermediate BED file is written to a temporary file that is
    removed before the function returns.
    """
    import os

    try:
        g = genomepy.Genome(genome)
        genome = g.sizes_file
    except Exception:
        # Deliberate best effort: if genomepy cannot resolve the name,
        # *genome* is used unchanged as the chromosome sizes file.
        pass

    dfs = [read_peak_file_to_df(fname) for fname in peaks]
    df_all = pd.concat(dfs)

    check_col = "log_value"
    if scale_value:
        check_col = "log_value_scaled"

    # store summit location + associated value in col4
    df_all["col4"] = (df_all["chrom"].astype(str) + ";" +
                      df_all["start"].astype(str) + ";" +
                      df_all["end"].astype(str) + ";" +
                      df_all[check_col].astype(str))

    # Only the path is needed; close the handle right away so the file can
    # be rewritten by path below.
    with NamedTemporaryFile(suffix=".all_peaks.bed", delete=False) as handle:
        tmp = handle.name

    try:
        out = df_all[["chrom", "start", "end",
                      "col4"]].sort_values(["chrom", "start"])
        out.to_csv(tmp, sep="\t", index=False, header=False)

        b = BedTool(tmp)
        all_summits = []
        # loop over merged peaks based on window size and collapse on col4 (summit + value)
        for f in b.slop(b=window // 2, g=genome).merge(c=4, o="collapse"):
            summits = [x.split(";") for x in f[3].split(",")]
            # only keep the highest summit (on ties, the last one listed)
            all_summits.append(
                sorted(summits, key=lambda x: float(x[3]))[-1][:3])
    finally:
        # delete=False means nobody else removes the file
        os.unlink(tmp)

    df = pd.DataFrame(all_summits, columns=["chrom", "start", "end"])
    return df
Ejemplo n.º 8
0
def test_ucsc_genome(genome="sacCer3", provider="UCSC"):
    """Test UCSC.

    Download S. cerevisiae genome from UCSC and retrieve a specific sequence.
    The temporary download directory is removed afterwards.
    """
    import shutil

    tmp = mkdtemp()
    try:
        genomepy.install_genome(genome, provider, genome_dir=tmp)
        g = genomepy.Genome(genome, genome_dir=tmp)
        seq = g["chrIV"][1337000:1337020]
        assert str(seq) == "TTTGGTTGTTCCTCTTCCTT"
    finally:
        # mkdtemp() never cleans up after itself
        shutil.rmtree(tmp)
Ejemplo n.º 9
0
def test_gaps(genome="tests/data/gap.fa"):
    """Check ``gaps``: initial keys, user override, and refill when emptied."""
    gap_chroms = ["chr1", "chr3"]
    gap_genome = genomepy.Genome(genome)
    assert list(gap_genome.gaps.keys()) == gap_chroms

    # a value set by the user is kept as is
    gap_genome.gaps = dict(asd=1)
    assert gap_genome.gaps == {"asd": 1}

    # an empty dict is filled again on access
    gap_genome.gaps = dict()
    assert list(gap_genome.gaps.keys()) == gap_chroms
Ejemplo n.º 10
0
def test__region_to_seq(genome="tests/data/small_genome.fa.gz",
                        region="chrI:10-20"):
    """Check ``_region_to_seq`` with and without extending the region."""
    small = genomepy.Genome(genome)

    # (extend_up, extend_down) -> expected sequence
    cases = (
        # the region exactly as given
        ((0, 0), "CCCACACACC"),
        # one base added upstream, two downstream of "CCCACACACC"
        ((1, 2), "ACCCACACACCCA"),
    )
    for (up, down), expected in cases:
        found = small._region_to_seq(
            region=region, extend_up=up, extend_down=down)
        assert found == expected
Ejemplo n.º 11
0
def test_ensembl_human():
    """Test Ensembl.

    Download human genome from Ensembl and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome("GRCh38.p10", "Ensembl", genome_dir=tmp)
        g = genomepy.Genome("GRCh38.p10", genome_dir=tmp)
        seq = g["6"][166168664:166168679]
        assert str(seq) == "CCTCCTCGCTCTCTT"
    finally:
        # remove the download even when the install or assertion fails
        shutil.rmtree(tmp)
Ejemplo n.º 12
0
def test_ensembl_genome():
    """Test Ensembl.

    Download Drosophila genome from Ensembl and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome("BDGP6", "Ensembl", genome_dir=tmp)
        g = genomepy.Genome("BDGP6", genome_dir=tmp)
        seq = g["3L"][10637840:10637875]
        assert str(seq).upper() == "TTTGCAACAGCTGCCGCAGTGTGACCGTTGTACTG"
    finally:
        # remove the download even when the install or assertion fails
        shutil.rmtree(tmp)
Ejemplo n.º 13
0
def test_ncbi_genome(genome="ASM2732v1", provider="NCBI"):
    """Test NCBI.

    Download smallest genome from NCBI and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome(genome, provider, genome_dir=tmp)
        g = genomepy.Genome(genome, genome_dir=tmp)
        seq = g["ANONYMOUS"][80:107]
        assert str(seq).upper() == "ATACCTTCCTTAATACTGTTAAATTAT"
    finally:
        # remove the download even when the install or assertion fails
        shutil.rmtree(tmp)
Ejemplo n.º 14
0
def test_ucsc_human():
    """Test UCSC.

    Download human genome from UCSC and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome("hg38", "UCSC", genome_dir=tmp)
        g = genomepy.Genome("hg38", genome_dir=tmp)
        seq = g["chr6"][166168664:166168679]
        assert str(seq) == "CCTCCTCGCTCTCTT"
    finally:
        # remove the download even when the install or assertion fails
        shutil.rmtree(tmp)
Ejemplo n.º 15
0
def test_ncbi_genome():
    """Test NCBI.

    Download Drosophila genome from NCBI and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome(
            "Release 6 plus ISO1 MT", "NCBI", genome_dir=tmp)
        # the installed genome is addressed with underscores, not spaces
        g = genomepy.Genome("Release_6_plus_ISO1_MT", genome_dir=tmp)
        seq = g["3L"][10637840:10637875]
        assert str(seq).upper() == "TTTGCAACAGCTGCCGCAGTGTGACCGTTGTACTG"
    finally:
        # remove the download even when the install or assertion fails
        shutil.rmtree(tmp)
Ejemplo n.º 16
0
def test_ncbi_human():
    """Test NCBI.

    Download human genome from NCBI and retrieve a
    specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome("GRCh38.p9", "NCBI", genome_dir=tmp)
        g = genomepy.Genome("GRCh38.p9", genome_dir=tmp)
        seq = g["6"][166168664:166168679]
        assert str(seq) == "CCTCCTCGCTCTCTT"
    finally:
        # remove the download even when the install or assertion fails
        shutil.rmtree(tmp)
Ejemplo n.º 17
0
def test_ensembl_genome(genome="KH", provider="Ensembl", version=98):
    """Test Ensembl.

    Download smallest genome from Ensembl's HTTPS and retrieve a specific sequence.
    """
    tmp = mkdtemp()
    try:
        # Only test on vertebrates as these are downloaded over HTTPS.
        # All others are downloaded over FTP, which is unreliable on Travis.
        genomepy.install_genome(
            genome, provider, genome_dir=tmp, version=version)
        g = genomepy.Genome(genome, genome_dir=tmp)
        seq = g["1"][40:60]
        # case-insensitive comparison: both sides are upper-cased
        assert str(seq).upper() == "nnnnnnnnnnAACCCCTAAC".upper()
    finally:
        # remove the download even when the install or assertion fails
        shutil.rmtree(tmp)
Ejemplo n.º 18
0
def test_sizes(genome="tests/data/gap.fa"):
    """Check ``sizes``: initial content, user override, refill when emptied."""
    all_chroms = ["chr1", "chr2", "chr3"]
    gap_genome = genomepy.Genome(genome)

    assert list(gap_genome.sizes.keys()) == all_chroms
    # every size is stored as an integer
    for chrom in gap_genome.sizes.keys():
        assert isinstance(gap_genome.sizes[chrom], int)
    assert gap_genome.sizes["chr1"] == 28

    # a value set by the user is kept as is
    gap_genome.sizes = dict(asd=1)
    assert gap_genome.sizes == {"asd": 1}

    # an empty dict is filled again on access
    gap_genome.sizes = dict()
    assert list(gap_genome.sizes.keys()) == all_chroms
Ejemplo n.º 19
0
def test_check_annotation_file(genome="tests/data/small_genome.fa.gz"):
    """Check ``check_annotation_file`` for a missing and an existing file.

    The scratch annotation file is removed even when the assertion fails,
    so a failing run does not leave it in ``tests/data``.
    """
    g = genomepy.Genome(genome)

    # does not exist
    gtf = g.check_annotation_file("gtf")
    assert gtf is None

    # does exist
    path = "tests/data/small_genome.annotation.test.gz"
    with open(path, "w") as fa:
        fa.write("test")
    try:
        test = g.check_annotation_file("test")
        assert test == os.path.abspath(path)
    finally:
        os.unlink(path)
Ejemplo n.º 20
0
def test__parse_name(genome="tests/data/small_genome.fa.gz"):
    """``_parse_name`` must reduce a name, a file path or a URL to a name."""
    # the genome itself does not matter, only the method is exercised
    parser = genomepy.Genome(genome)

    cases = [
        # name
        ("test", "test"),
        # file
        ("/home/genomepy/genomes/test2.fa", "test2"),
        # url
        ("http://ftp.xenbase.org/pub/Genomics/JGI/Xentr9.1/XT9_1.fa.gz",
         "XT9_1"),
    ]
    for raw, expected in cases:
        assert parser._parse_name(raw) == expected
Ejemplo n.º 21
0
def test__update_assembly_accession(genome="tests/data/small_genome.fa.gz"):
    """Check the accession written with and without a provider genome."""
    small = genomepy.Genome(genome)

    # without provider information the accession falls back to "na"
    without_provider = {}
    small._update_assembly_accession(without_provider)
    assert without_provider["assembly_accession"] == "na"

    # with the NCBI record the real accession is filled in
    with_provider = {}
    ncbi = ProviderBase.create("NCBI")
    ncbi_record = ncbi.genomes.get("ASM14646v1")
    small._update_assembly_accession(with_provider, ncbi, ncbi_record)
    assert with_provider["assembly_accession"] == "GCA_000146465.1"
Ejemplo n.º 22
0
def test__update_tax_id(genome="tests/data/small_genome.fa.gz"):
    """Check the tax id written with and without a provider genome."""
    small = genomepy.Genome(genome)

    # without provider information the tax id falls back to "na"
    without_provider = {}
    small._update_tax_id(without_provider)
    assert without_provider["tax_id"] == "na"

    # with the NCBI record the real tax id is filled in
    with_provider = {}
    ncbi = ProviderBase.create("NCBI")
    ncbi_record = ncbi.genomes.get("ASM14646v1")
    small._update_tax_id(with_provider, ncbi, ncbi_record)
    assert with_provider["tax_id"] == "58839"
Ejemplo n.º 23
0
def test__update_provider(genome="tests/data/small_genome.fa.gz"):
    """Check the provider derived from metadata with and without a URL."""
    small = genomepy.Genome(genome)

    # no url to parse: the provider is reported as unknown
    no_url = {}
    small._update_provider(no_url)
    assert no_url.get("provider") == "Unknown"

    # an NCBI download url identifies the provider
    ncbi_url = (
        "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/465/"
        "GCF_000146465.1_ASM14646v1/GCF_000146465.1_ASM14646v1_genomic.fna.gz"
    )
    with_url = {"genome url": ncbi_url}
    small._update_provider(with_url)
    assert with_url.get("provider") == "NCBI"
Ejemplo n.º 24
0
def test_url_genome():
    """Test URL.

    Download the C. elegans (ce11) genome directly from a UCSC url and
    retrieve a specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome(
            "http://hgdownload.soe.ucsc.edu/goldenPath/ce11/bigZips/chromFa.tar.gz",
            "url",
            genome_dir=tmp,
            localname="url_test",
        )
        g = genomepy.Genome("url_test", genome_dir=tmp)
        assert str(g["chrI"][:12]).lower() == "gcctaagcctaa"
    finally:
        # remove the download even when the install or assertion fails
        shutil.rmtree(tmp)
Ejemplo n.º 25
0
def test__regions_to_seqs(genome="tests/data/small_genome.fa.gz",
                          track="tests/data/regions.txt"):
    """Check ``Genome._regions_to_seqs`` on a two-region interval file.

    Each result is materialised and its length checked, so an empty or
    truncated result fails the test instead of skipping assertions.
    """
    g = genomepy.Genome(genome)
    expected_names = ["chrI:10-20", "chrII:20-30"]

    def check(expected_seqs, **kwargs):
        # list() so that the record count can be asserted explicitly
        seqs = list(g._regions_to_seqs(track=track, **kwargs))
        assert len(seqs) == len(expected_names)
        for seq, name, expected in zip(seqs, expected_names, expected_seqs):
            assert seq.name == name
            assert seq.seq == expected

    # extract sequences marked in regions.txt from small_genome.fa.gz
    check(["CCCACACACC", "TCCTCCAAGC"], extend_up=0, extend_down=0)

    # extend by varying amounts
    # original: "CCCACACACC", "TCCTCCAAGC"
    check(["ACCCACACACCCA", "CTCCTCCAAGCCC"], extend_up=1, extend_down=2)
Ejemplo n.º 26
0
def test_get_random_sequences(genome="tests/data/small_genome.fa.gz"):
    """Check shape, types and N content of randomly sampled regions."""
    small = genomepy.Genome(genome)
    n = 2
    length = 200  # default
    chroms = ["chrI", "chrII"]
    max_n = 0.1  # default

    regions = small.get_random_sequences(
        n=n, length=length, chroms=chroms, max_n=max_n)

    # one (chrom, start, end) record per requested sequence
    assert len(regions) == n
    for region in regions:
        chrom, start, end = region[0], region[1], region[2]
        assert chrom in chroms
        assert (isinstance(chrom, str) and isinstance(start, int)
                and isinstance(end, int))
        assert end - start == length

    # the fraction of Ns must stay at or below the cutoff
    as_strings = small.get_random_sequences(
        n=1, chroms=chroms, outtype="string")
    sampled = small.track2fasta(as_strings[0])[0].seq
    n_count = str(sampled).upper().count("N")
    assert n_count <= length * max_n
Ejemplo n.º 27
0
def test__read_metadata(genome="tests/data/small_genome.fa.gz"):
    """Check ``_read_metadata`` against several README states.

    Covers: no README, README with a provider, README with only a genome
    url, and a read-only README.
    """
    small = genomepy.Genome(genome)
    readme = small.readme_file

    url_line = (
        "genome url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/"
        "146/465/GCF_000146465.1_ASM14646v1/"
        "GCF_000146465.1_ASM14646v1_genomic.fna.gz\n")
    tax_line = "tax_id: not_really_58839\n"
    accession_line = "assembly_accession: not_really_GCA_000146465.1\n"

    # no readme found
    if os.path.exists(readme):
        os.unlink(readme)
    assert small._read_metadata()["provider"] == "na"

    # values already present in the README are not overwritten
    with open(readme, "w") as handle:
        handle.writelines(
            ["provider: not_really_NCBI\n", tax_line, accession_line])
    assert small._read_metadata()["provider"] == "not_really_NCBI"

    # the provider is derived from the url, in the dict and in the file
    with open(readme, "w") as handle:
        handle.writelines([url_line, tax_line, accession_line])
    from_genome = small._read_metadata()
    assert from_genome["provider"] == "NCBI"
    from_file, _ = genomepy.utils.read_readme(readme)
    assert from_file["provider"] == "NCBI"

    # no writing permission to file
    with open(readme, "w") as handle:
        handle.writelines([url_line])
    os.chmod(readme, S_IREAD | S_IRGRP | S_IROTH)
    from_genome = small._read_metadata()
    assert from_genome["provider"] == "na"
    os.unlink(readme)
Ejemplo n.º 28
0
def test_track2fasta(genome="tests/data/small_genome.fa.gz"):
    """Check ``track2fasta`` on an interval file and on a BED file.

    Track number ``i`` is extended by ``i`` bases upstream and ``i + 1``
    bases downstream.
    """
    tracks = [
        ("tests/data/regions.txt", "interval"),
        ("tests/data/regions.bed", "bed"),
    ]
    # expected sequences per track; unextended first region is CCCACACACC
    expected = [
        ("CCCACACACCC", "TCCTCCAAGCC"),      # extend_up=0, extend_down=1
        ("ACCCACACACCCA", "CTCCTCCAAGCCC"),  # extend_up=1, extend_down=2
    ]
    small = genomepy.Genome(genome)

    for idx, (track_path, _track_type) in enumerate(tracks):
        records = small.track2fasta(
            track=track_path,
            fastafile=None,
            stranded=False,
            extend_up=idx,
            extend_down=idx + 1,
        )
        first, second = expected[idx]
        assert records[0].seq == first
        assert records[1].seq == second
Ejemplo n.º 29
0
def test__parse_filename(genome="tests/data/small_genome.fa.gz"):
    """Check ``_parse_filename`` for a file, a folder, a name and a miss.

    The scratch genome directory is removed even when an assertion fails,
    and a directory left behind by an earlier run no longer breaks setup.
    """
    g = genomepy.Genome(genome)  # unimportant

    # file path
    filename = g._parse_filename(genome)
    assert filename == os.path.abspath(genome)

    # folder path
    filename = g._parse_filename(os.path.dirname(genome))
    assert filename == os.path.abspath(genome)

    # name of genome in genomes_dir
    scratch_dir = "tests/data/small_genome"
    os.makedirs(scratch_dir, exist_ok=True)
    try:
        with open("tests/data/small_genome/small_genome.fa.gz", "w") as fa:
            fa.write("test")
        g.genomes_dir = "tests/data/"
        filename = g._parse_filename(os.path.basename(genome))
        assert filename == "tests/data/small_genome/small_genome.fa.gz"
    finally:
        shutil.rmtree(scratch_dir)

    # genome not found
    with pytest.raises(FileNotFoundError):
        g._parse_filename("does not exist")
Ejemplo n.º 30
0
def genome(request):
    """Build a scratch genomes directory and return a ``Genome`` from it.

    The test fasta is copied in under the name "ce10". When
    ``request.param`` is "unzipped" the copy is gunzipped and an annotation
    file is copied next to it.
    """
    name = "ce10"  # Use fake name for blacklist test
    source_fasta = "tests/data/small_genome.fa.gz"

    # always start from a clean scratch directory
    scratch_root = os.path.join(os.getcwd(), ".genomepy_plugin_tests")
    if os.path.exists(scratch_root):
        genomepy.utils.rm_rf(scratch_root)

    target_dir = os.path.join(scratch_root, name)
    genomepy.utils.mkdir_p(target_dir)
    fasta_copy = os.path.join(target_dir, f"{name}.fa.gz")
    copyfile(source_fasta, fasta_copy)

    if request.param == "unzipped":
        # unzip genome
        sp.check_call(["gunzip", fasta_copy])

        # add annotation (for STAR and hisat2), but only once
        copyfile(
            "tests/data/ce10.annotation.gtf.gz",
            os.path.join(target_dir, f"{name}.annotation.gtf.gz"),
        )

    return genomepy.Genome(name, genomes_dir=scratch_root)