Esempio n. 1
0
def create_attributes():
    """Fetch the attributes dataframe of each reference dataset and pickle it.

    One file named ``attributes_<dataset>.pkl`` is written into ``DATADIR``
    for every dataset listed below, in the order given.
    """
    dataset_names = (
        "hsapiens_gene_ensembl",
        "closure_ECO",
        "hsapiens_encode",
        "chircus_snp",
        "hsapiens_peak",
    )
    for dataset_name in dataset_names:
        pickle_name = "attributes_" + dataset_name + ".pkl"
        attributes = apy.find_attributes(dataset_name)
        attributes.to_pickle(os.path.join(DATADIR, pickle_name))
Esempio n. 2
0
def test_find_attributes_genomic(df_attributes_genomic_hsapiens_encode):
    """Check find_attributes() against the stored hsapiens_encode attributes.

    Both frames are sorted by ``Attribute_ID`` and re-indexed so the
    comparison does not depend on row order.
    """
    sort_column = "Attribute_ID"

    expected = df_attributes_genomic_hsapiens_encode.sort_values(
        by=sort_column, axis=0)
    expected = expected.reset_index(drop=True)

    fetched = find_attributes("hsapiens_encode")
    actual = fetched.sort_values(by=sort_column, axis=0)
    actual = actual.reset_index(drop=True)

    assert_frame_equal(actual, expected)
Esempio n. 3
0
def test_find_attributes_snp(df_attributes_snp_chircus_snp):
    """Check find_attributes() against the stored chircus_snp attributes.

    Both frames are sorted by ``Attribute_ID`` and re-indexed so the
    comparison does not depend on row order.
    """
    sort_column = "Attribute_ID"

    expected = df_attributes_snp_chircus_snp.sort_values(
        by=sort_column, axis=0)
    expected = expected.reset_index(drop=True)

    fetched = find_attributes("chircus_snp")
    actual = fetched.sort_values(by=sort_column, axis=0)
    actual = actual.reset_index(drop=True)

    assert_frame_equal(actual, expected)
Esempio n. 4
0
def test_find_attributes_ontology(df_attributes_ontology_closure_eco):
    """Check find_attributes() against the stored closure_ECO attributes.

    Both frames are sorted by ``Attribute_ID`` and re-indexed so the
    comparison does not depend on row order.
    """
    sort_column = "Attribute_ID"

    expected = df_attributes_ontology_closure_eco.sort_values(
        by=sort_column, axis=0)
    expected = expected.reset_index(drop=True)

    fetched = find_attributes("closure_ECO")
    actual = fetched.sort_values(by=sort_column, axis=0)
    actual = actual.reset_index(drop=True)

    assert_frame_equal(actual, expected)
Esempio n. 5
0
def test_find_attributes_default(df_attributes_ensembl_hsapiens_gene):
    """Check find_attributes() with no arguments against the stored
    hsapiens_gene_ensembl attributes.

    Both frames are sorted by ``Attribute_ID`` and re-indexed so the
    comparison does not depend on row order.
    """
    sort_column = "Attribute_ID"

    expected = df_attributes_ensembl_hsapiens_gene.sort_values(
        by=sort_column, axis=0)
    expected = expected.reset_index(drop=True)

    fetched = find_attributes()
    actual = fetched.sort_values(by=sort_column, axis=0)
    actual = actual.reset_index(drop=True)

    assert_frame_equal(actual, expected)
Esempio n. 6
0
def test_find_attributes_funcgen(df_attributes_funcgen_hsapiens_peak):
    """Check find_attributes() against the stored hsapiens_peak attributes.

    Both frames are sorted by ``Attribute_ID`` and re-indexed so the
    comparison does not depend on row order.
    """
    sort_column = "Attribute_ID"

    expected = df_attributes_funcgen_hsapiens_peak.sort_values(
        by=sort_column, axis=0)
    expected = expected.reset_index(drop=True)

    fetched = find_attributes("hsapiens_peak")
    actual = fetched.sort_values(by=sort_column, axis=0)
    actual = actual.reset_index(drop=True)

    assert_frame_equal(actual, expected)
Esempio n. 7
0
def test_find_attributes_output(df_attributes_ensembl_hsapiens_gene):
    """Test the CSV written by find_attributes() when given a filename, for
    the default dataset (hsapiens_gene_ensembl).

    The saved file is read back, missing values are turned into empty
    strings, and the frame is compared with the stored attributes after both
    are sorted by ``Attribute_ID`` and re-indexed.
    """
    expect = (df_attributes_ensembl_hsapiens_gene.sort_values(
        by="Attribute_ID", axis=0).reset_index(drop=True))
    output = "tested.csv"

    # The try block spans the whole lifetime of the file so it is cleaned up
    # even when reading it back or normalising it fails, not only when the
    # final comparison fails.
    try:
        _ = find_attributes(save=True, output=output)
        saved = pd.read_csv(output)
        # fillna("") replaces the former replace(pd.np.nan, ""): the
        # pandas.np alias is deprecated and no longer exists in pandas 2.x.
        result = (saved.fillna("").sort_values(
            by="Attribute_ID", axis=0).reset_index(drop=True))

        assert_frame_equal(result, expect)
    finally:
        # Guarded so that a failure before the file was written is not
        # masked by a FileNotFoundError raised during cleanup.
        if os.path.exists(output):
            os.remove(output)
Esempio n. 8
0
def pull_ensembl(complete_file):
    """Download a BioMart table for every dataset that is not yet on disk.

    For each ``Dataset_ID`` returned by ``find_datasets()`` the subset of the
    wanted attributes that the dataset actually offers is queried and written
    as a tab-separated file to the path given by ``make_local_name``.
    Datasets whose output file already exists are skipped. When the loop is
    done, a one-line summary is written to ``complete_file``.

    Args:
        complete_file: Path of the marker file written once all datasets
            have been processed.
    """
    # Kept as an ordered tuple (not a set) so the attributes are requested in
    # the same order on every run and the TSV columns stay stable.
    wanted_attributes = (
        "ensembl_gene_id", "ensembl_peptide_id", "description",
        "external_gene_name", "external_gene_source", "external_synonym",
        "chromosome_name", "source", "gene_biotype", "entrezgene_id",
        "zfin_id_id", 'mgi_id', 'rgd_id', 'flybase_gene_id', 'sgd_gene',
        'wormbase_gene',
    )
    datasets = find_datasets()
    for ds in datasets['Dataset_ID']:
        print(ds)
        outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
        # Really, we should let snakemake handle this, but then we would need
        # to put a list of all the 200+ sets in our config, and keep it up to
        # date.  Maybe you could have a job that gets the datasets and writes
        # a dataset file, but then updates the config? That sounds bogus.
        if os.path.exists(outfile):
            continue
        available = set(find_attributes(ds)['Attribute_ID'].to_list())
        # Only ask for attributes this dataset provides, in declared order.
        attributes = [name for name in wanted_attributes if name in available]
        df = query(attributes=attributes, filters={}, dataset=ds)
        df.to_csv(outfile, index=False, sep='\t')
    with open(complete_file, 'w') as outf:
        outf.write(f'Downloaded gene sets for {len(datasets)} data sets.')