Example #1
import pandas as pd

from oats.biology.dataset import Dataset


def small_dataset():
    # Read the small test CSV and wrap it in a Dataset object.
    with open("tests/data/small_dataset.csv") as f:
        df = pd.read_csv(f)
    dataset = Dataset()
    dataset.add_data(df)
    return dataset
Example #2
import json

from oats.biology.dataset import Dataset


# Note: the original snippet starts mid-function; the signature below is inferred from the body.
def truncate_string(text, char_limit):
    # Keep only the first char_limit characters, marking longer strings with an ellipsis.
    truncated_text = text[:char_limit]
    if len(text) > char_limit:
        truncated_text = "{}...".format(truncated_text)
    return truncated_text


def truncate_list(list_, item_limit):
    # Keep only the first item_limit items, marking longer lists with an ellipsis.
    truncated_list = list_[:item_limit]
    if len(list_) > item_limit:
        truncated_list.append("...")
    return truncated_list


# Load the dataset that was created and saved in the previous step in the pipeline.
path = "../genes_texts_annots.csv"
dataset = Dataset(path)

# Output this dataset as a new json file.
json_data = dataset.to_json()
json_path = "../genes_texts_annots.json"
with open(json_path, "w") as f:
    json.dump(json_data, f, indent=4)

# Create a sample version of the file by truncating some of the strings and lists.
json_data = dataset.to_json()
json_path = "../genes_texts_annots_sample.json"

# Subset both the number of entries in the dataset and truncate information in each field.
list_limit = 4
description_char_limit = 100
num_genes = 100
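
# The original snippet ends here, before the sample file is written. Below is a minimal sketch of
# how the helpers above could be applied; it assumes json_data is a list of per-gene dictionaries
# with a "description" string and list-valued fields, which is an assumption about the structure
# returned by dataset.to_json() rather than something shown in this excerpt.
sample_records = json_data[:num_genes]
for record in sample_records:
    if isinstance(record.get("description"), str):
        record["description"] = truncate_string(record["description"], description_char_limit)
    for field, value in record.items():
        if isinstance(value, list):
            record[field] = truncate_list(value, list_limit)
with open(json_path, "w") as f:
    json.dump(sample_records, f, indent=4)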
Example #3
# Imports needed by the rest of this snippet; the truncated excerpt does not show the earlier import cell.
import os
import random
import datetime
import warnings
from collections import defaultdict

import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess

from oats.biology.dataset import Dataset
from oats.nlp.vocabulary import get_overrepresented_tokens, get_vocab_from_tokens
from oats.nlp.vocabulary import reduce_vocab_connected_components, reduce_vocab_linares_pontes, token_enrichment

warnings.simplefilter('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
nltk.download('punkt', quiet=True)
nltk.download('brown', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
# The wordnet corpus is needed by the WordNetLemmatizer used further down in this snippet.
nltk.download('wordnet', quiet=True)

# In[2]:

# Paths to the files that are used for this notebook.
dataset_path = "../../quoats/data/genes_texts_annots.csv"
dataset = Dataset(dataset_path, keep_ids=True)
dataset.describe()

# ### Anthocyanin biosynthesis genes

# Previously, out of this list, we had 10 of 16 maize genes in the dataset and 16 of 18 Arabidopsis genes. Now we have 13 of 16 maize genes and 18 of 18 Arabidopsis genes.

# In[ ]:

# Map each gene identifier in the CSV of anthocyanin biosynthesis genes to an internal dataset ID,
# using -1 for identifiers that are not present in the dataset.
mapping = dataset.get_species_to_name_to_ids_dictionary(include_synonyms=False, lowercase=True)
genes = pd.read_csv("anthocyanin_biosynthesis_genes.csv")
genes["id"] = genes.apply(
    lambda x: mapping[x["species_code"]].get(x["identifier"].strip().lower(), -1),
    axis=1)
# Note: the original snippet is truncated here; the argument passed to .map() is not shown.
genes[genes["id"] != -1]["id"] = genes[genes["id"] != -1]["id"].map(
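
# Aside: chained indexing like the truncated line above does not modify `genes` in place in
# pandas. A self-contained sketch of the idiomatic .loc pattern is shown here with a small
# hypothetical frame and mapping; `example_map` and its values are illustrative only and are
# not taken from the original notebook.
example = pd.DataFrame({"id": [-1, 3, 7]})
example_map = {3: 103, 7: 107}
matched = example["id"] != -1
example.loc[matched, "id"] = example.loc[matched, "id"].map(example_map)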
# In[3]:

# Create and name an output directory according to when the notebook was run.
OUTPUT_NAME = "composition"
OUTPUT_DIR = os.path.join(
    "../outputs",
    "{}_{}_{}".format(OUTPUT_NAME,
                      datetime.datetime.now().strftime('%m_%d_%Y_h%Hm%Ms%S'),
                      random.randrange(1000, 9999)))
os.mkdir(OUTPUT_DIR)

# In[4]:

# Reading in and describing the dataset of plant genes. The plant_dataset_path variable is
# defined in an earlier part of the notebook that is not included in this excerpt.
plant_dataset = Dataset(plant_dataset_path)
plant_dataset.filter_has_description()
plant_dataset.describe()

# ### What's there for each species?
# The previously loaded dataset contains all of the genes across six plant species that have natural language description data for phenotype(s) related to that gene. Each gene can have multiple descriptions annotated to it, which were combined or concatenated when the datasets from multiple sources were merged to create the pickled datasets. Arabidopsis has the highest number of genes that satisfy this criterion, followed by maize, and then by the other four species, which have a relatively low number of qualifying genes, at least given the sources used for this work. Note that the number of unique descriptions is lower than the number of genes in all cases, because multiple genes can have the same phenotype description associated with them.
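
# The per-species counts described above can be checked directly from the pandas view of the
# dataset. This is a minimal sketch only: it assumes the frame exposes "species" and
# "descriptions" columns, which matches the column names used in the other snippets but is not
# confirmed by the excerpt above.
plant_df = plant_dataset.to_pandas()
for species, group in plant_df.groupby("species"):
    print(species, len(group), "genes,", group["descriptions"].nunique(), "unique descriptions")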

# In[5]:

data = plant_dataset

# Lemmatize a document: lowercase and tokenize with gensim's simple_preprocess, then lemmatize each token.
wnl = WordNetLemmatizer()
lemmatize_doc = lambda d: [wnl.lemmatize(x) for x in simple_preprocess(d)]

# Token distributions to be filled in; the original snippet is truncated before this happens.
dists = defaultdict(list)
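
# A minimal sketch of one way dists could be populated with lemmatized description tokens per
# species. The "species" and "descriptions" column names are assumptions carried over from the
# other snippets, and this is not necessarily how the truncated notebook fills the dictionary.
df_for_dists = data.to_pandas()
for species, description in zip(df_for_dists["species"], df_for_dists["descriptions"]):
    dists[species].extend(lemmatize_doc(description))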
Example #5
import sys
import pandas as pd
import numpy as np
import glob
import os
import warnings
warnings.simplefilter('ignore')
sys.path.append("../../oats")
from oats.biology.dataset import Dataset

path = "../genes_texts_annots.csv"

# Recreating the dataset object from the saved csv file.
# Double checking for things that we know should be true about the descriptions if preprocessing works as intended.
df = Dataset(path).to_pandas()
descriptions = df["descriptions"].values
assert len([s for s in descriptions if "|" in s]) == 0
assert len([s for s in descriptions if "  " in s]) == 0

# Check again this time reading in the dataframe directly from the csv file.
# Double checking for things that we know should be true about the descriptions if preprocessing works as intended.
df = pd.read_csv(path)
descriptions = df["descriptions"].values
assert len([s for s in descriptions if "|" in s]) == 0
assert len([s for s in descriptions if "  " in s]) == 0

print("completed check of genes_texts_annots file")
Example #6
def test_json():
    input_data = datasets["simple_dataset"]
    dataset = Dataset(data=input_data, keep_ids=True)
    import json
    with open("/Users/irbraun/Desktop/testing.json", "w") as f:
        json.dump(dataset.to_json(), f, indent=4)
Example #7
def test_collapsing_by_all_gene_names(input_data, expected, case_sensitive):

    # Using the constructor and retaining the original size, then collapsing by calling the hidden method.
    # This is not the intended way to do this but it should always work.
    dataset = Dataset(data=input_data, keep_ids=True)
    dataset._collapse_by_all_gene_names(case_sensitive)
    assert dataset.to_pandas().shape[0] == expected

    # Create a blank dataset then add this information to it, and it should be automatically collapsed.
    # This is one of the intended ways to do this.
    dataset = Dataset()
    dataset.add_data(new_data=input_data, case_sensitive=case_sensitive)
    assert dataset.to_pandas().shape[0] == expected

    # Use the constructor and don't specify that IDs should be kept, so it gets automatically collapsed.
    # This is one of the intended ways to do this.
    dataset = Dataset(data=input_data,
                      keep_ids=False,
                      case_sensitive=case_sensitive)
    assert dataset.to_pandas().shape[0] == expected
Example #8
def test_reading_in_data(input_data, expected):

    # Using the constructor method and retaining the original IDs (and therefore size) of the dataset.
    dataset = Dataset(data=input_data, keep_ids=True)
    assert dataset.to_pandas().shape[0] == expected