コード例 #1
 def _remove_duplicate_names(row):
     """Drop synonyms that are already listed as unique gene identifiers.

     Args:
         row: A mapping (for example one dataframe row) holding the
             bar-delimited strings "unique_gene_identifiers" and
             "other_gene_identifiers".

     Returns:
         The result of concatenate_with_delim("|", ...) applied to the
         entries of "other_gene_identifiers" that do not also occur in
         "unique_gene_identifiers", in their original order.
     """
     # A set gives constant-time membership tests for the filter below.
     gene_names = set(row["unique_gene_identifiers"].split("|"))
     gene_synonyms = row["other_gene_identifiers"].split("|")
     updated_gene_synonyms = [x for x in gene_synonyms if x not in gene_names]
     return concatenate_with_delim("|", updated_gene_synonyms)
コード例 #2
def clean_oryzabase_symbol_synonyms(string):
    """Clean one value of the gene symbol synonym(s) column.

    Should be applied to the gene symbol synonym(s) column in the dataset.

    Args:
        string: The raw cell text, a comma-separated list of symbols.

    Returns:
        A single string representing a bar delimited list of gene symbols,
        as built by concatenate_with_delim.
    """
    # Asterisks are dropped before the value is split into symbols.
    string = string.replace("*", "")
    names = [
        remove_enclosing_brackets(name.strip()) for name in string.split(",")
    ]
    # The joined string has to be returned: callers use this function with
    # Series.apply and store the result back into the column.
    return concatenate_with_delim("|", names)
コード例 #3
def clean_oryzabase_symbol(string):
    """Clean one value of the gene symbol column.

    Should be applied to the gene symbol column in the dataset.

    Args:
        string: The raw cell text of the gene symbol column.

    Returns:
        A single string representing a bar delimited list of gene symbols,
        as built by concatenate_with_delim.
    """
    string = string.replace("*", "")
    # NOTE(review): presumably this splits a symbol from a synonym given in
    # parentheses and returns them as a list; confirm against the helper.
    names = handle_synonym_in_parentheses(string, min_length=4)
    names = [remove_enclosing_brackets(name) for name in names]
    # Retain only names that are at least two characters long.
    names = [name for name in names if len(name) >= 2]
    # The joined string has to be returned: callers use this function with
    # Series.apply and store the result back into the column.
    return concatenate_with_delim("|", names)
コード例 #4
 def _reorder_unique_gene_identifers(row):
     """Rebuild the unique identifier string without the gene model names.

     Args:
         row: A mapping (for example one dataframe row) holding the
             bar-delimited strings "unique_gene_identifiers" and
             "gene_models".

     Returns:
         The result of concatenate_with_delim("|", ...) applied to the
         entries of "unique_gene_identifiers" that are not listed in
         "gene_models", in their original order.
     """
     unique_identifiers = row["unique_gene_identifiers"].split("|")
     # A set gives constant-time membership tests for the filter below.
     gene_models = set(row["gene_models"].split("|"))
     reordered_unique_identifiers = [
         x for x in unique_identifiers if x not in gene_models
     ]
     # NOTE(review): the name says "reorder", but nothing visible here puts
     # the gene models back into the list after filtering them out. Confirm
     # whether they should be re-appended before joining.
     return concatenate_with_delim("|", reordered_unique_identifiers)
コード例 #5
ファイル: groupings.py プロジェクト: irbraun/oats
    def get_dataframe_for_plantcyc(paths):
        """Build one dataframe of pathway membership from PlantCyc files.

        Args:
            paths: Mapping from a species code to the path of that species'
                pathways file, which is read with pandas.read_table.

        Returns:
            The concatenation (pandas.concat) of one dataframe per species,
            each with the columns "species", "group_ids", "gene_identifiers",
            "pathway_id", "pathway_name" and "ec_number".
        """
        usecols = [
            "Pathway-id", "Pathway-name", "Reaction-id", "EC",
            "Protein-id", "Protein-name", "Gene-id", "Gene-name",
        ]
        usenames = [
            "pathway_id", "pathway_name", "reaction_id", "ec_number",
            "protein_id", "protein_name", "gene_id", "gene_name",
        ]
        renamed = dict(zip(usecols, usenames))
        gene_columns = ["protein_id", "protein_name", "gene_id", "gene_name"]
        output_columns = [
            "species", "group_ids", "gene_identifiers", "pathway_id",
            "pathway_name", "ec_number",
        ]

        def combine_columns(row, columns):
            # Hand the selected cells of one row to the shared joining helper.
            return concatenate_with_delim("|", [row[col] for col in columns])

        dfs_for_each_species = []
        for species_code, pathways_filepath in paths.items():
            df = pd.read_table(pathways_filepath, usecols=usecols)
            df = df.rename(columns=renamed).fillna("")

            # Note, manually reviewed the conventions in gene names for the PlantCyc dataset.
            # The string "unknown" is used for missing values, don't add this as a gene name.
            df = df.replace(to_replace="unknown", value="")
            # axis=1 so that the lambda receives one row at a time and can
            # index it by column name.
            df["gene_identifiers"] = df.apply(
                lambda x: combine_columns(x, gene_columns), axis=1)

            # Some other manipulations to clean up the data, based on how missing values are specified in the PlantCyc files.
            # Don't retain rows where no gene names are referenced. The copy
            # keeps the assignments below from writing into a slice of df.
            df = df[df["gene_identifiers"] != ""].copy()
            df["ec_number"] = df["ec_number"].map(
                lambda x: "" if x == "-" else x)
            df["group_ids"] = df["pathway_id"]
            df["species"] = species_code

            # Collect the per-species frame; without this the final concat
            # would be handed an empty list.
            dfs_for_each_species.append(df[output_columns])

        df = pd.concat(dfs_for_each_species)
        return df
コード例 #6
    def _collapse_by_all_gene_names(self, case_sensitive=False):
        """Merge all the records where the species and any of the listed gene names or identifiers match.

        Text descriptions are concatenated and a union of the gene names and ontology term IDs is retained.

        Args:
            case_sensitive (bool, optional): Set to true if gene names that only differ in terms of case
                should be treated as different genes, by default these genes are considered to be the same gene.

        NOTE(review): this copy of the method appears to be missing lines (see the
        notes in the body); it cannot run until they are restored from the upstream
        source.
        """

        # Build the graph model of this data where nodes are gene names or IDs.
        # NOTE(review): the apply() call below is never closed, so its remaining
        # arguments are missing, and no visible line adds `edges` to `g` before
        # the connected components are computed. Restore both from upstream.
        g = nx.Graph()
        edges = self.df.apply(Dataset._generate_edges,
        edges = list(chain.from_iterable(edges.values))

        # Get the connected components of the graph, a list of lists of nodes. Each component will
        # always have one or more IDs in it as (a) node(s), because every gene name has to be
        # associated with at least one ID, corresponding to the row that it was in in the input data.
        components = nx.connected_components(g)

        # Build a dictionary mapping node names (IDs and gene names) to component values.
        # Is there a more efficient way to get a node to component mapping? There probably
        # should be; we don't need everything to be in the mapping, just one thing from each
        # entry, which could be just the ID values, don't need any of the gene names. This is
        # because all nodes mentioned in one entry will always be in the same component. One
        # solution would be to sort all the lists of nodes then just take the first, if we
        # could make sure the ID value would always be first? That might be even slower,
        # depends on ratio of number of entries to number of names?
        node_to_component = {}
        component_index = 0
        for node_set in components:
            for node in node_set:
                node_to_component[node] = component_index
            component_index = component_index + 1

        # Create a new column that indicates which connected component that entry maps to.
        self.df["component"] = self.df["id"].map(node_to_component)

        # Group by the connected component column and merge the other fields appropriately.
        # NOTE(review): the column-name keys of this mapping and its closing "})" are
        # missing; only the aggregation functions remain. Restore the keys from upstream
        # rather than guessing which column each function belongs to.
        self.df = self.df.groupby("component").agg({
            lambda x: x.values[0],
            lambda x: concatenate_with_delim("|", x),
            lambda x: concatenate_with_delim("|", x),
            lambda x: concatenate_with_delim("|", x),
            lambda x: concatenate_texts(x),
            lambda x: concatenate_with_delim("|", x),
            lambda x: concatenate_with_delim("|", x)

        # Merging may have resulted in names or identifiers being listed as both gene names and synonyms.
        # Remove them from the synonym list if they are in the gene name list.
        self.df["other_gene_identifiers"] = self.df.apply(
            lambda x: Dataset._remove_duplicate_names(x), axis=1)
        self.df["unique_gene_identifiers"] = self.df.apply(
            lambda x: Dataset._reorder_unique_gene_identifers(x), axis=1)

        # Reset the ID values in the dataset to reflect this change.
        self.df["id"] = None
コード例 #7
         range=(0, 50),
fig.set_size_inches(15, 4)

# In[10]:

# Organizing the desired information into a standard set of column headers.
def combine_columns(row, columns):
    """Pass the values of `columns` in `row` to concatenate_with_delim with "|"."""
    return concatenate_with_delim("|", [row[column] for column in columns])


df["species_code"] = "sly"
df["species_name"] = "tomato"
df["text_unprocessed"] = df["allele_phenotype"]
# axis=1 makes apply hand over one row at a time, which combine_columns
# then indexes by column name.
df["unique_gene_identifiers"] = df.apply(
    lambda x: combine_columns(x, ["locus", "locus_symbol"]), axis=1)
df["other_gene_identifiers"] = df.apply(
    lambda x: combine_columns(
        x, ["locus_name", "allele_symbol", "allele_name"]),
    axis=1)
df["gene_models"] = df["locus"]
df["annotations"] = ""
df["reference_name"] = "SGN"
df["reference_link"] = "https://solgenomics.net/"
df["reference_file"] = "sgn_tomato_phenotyped_loci.txt"
# Keep only the standard columns; reshaped_columns is defined elsewhere.
df = df[reshaped_columns]
コード例 #8
def clean_oryzabase_explainations(string):
    """Remove ontology term IDs from one value of the explanation column.

    Should be applied to the explanation column in the dataset.

    Args:
        string: The raw cell text of the explanation column.

    Returns:
        A version of the text without some of the redundant information:
        every ID reported by get_ontology_ids is removed and, when at least
        one ID was found, the result is passed through remove_punctuation.
        Text in which no ID is found is returned unchanged.
    """
    ontology_ids = list(get_ontology_ids(string))
    for ontology_id in ontology_ids:
        string = string.replace(ontology_id, "")
    # Strip punctuation only after every ID has been removed. Doing it inside
    # the loop could alter the remaining IDs in the text (they presumably
    # contain punctuation, e.g. a colon) so that later replacements no longer
    # match. The guard keeps ID-free text untouched, as before.
    if ontology_ids:
        string = remove_punctuation(string)
    return string

# In[14]:

# Restructuring and combining columns that have gene name information.
def combine_columns(row, columns):
    """Pass the values of `columns` in `row` to concatenate_with_delim with "|"."""
    return concatenate_with_delim("|", [row[column] for column in columns])


def _strip_gene_name(value):
    """Remove underscores from a gene name and trim surrounding whitespace."""
    return value.replace("_", "").strip()


def _commas_to_bars(value):
    """Switch the delimiter of a synonym list from "," to "|"."""
    return replace_delimiter(text=value, old_delim=",", new_delim="|")


def _bar_delimited_ontology_ids(value):
    """Join the ontology IDs found in a cell using concatenate_with_delim."""
    return concatenate_with_delim("|", get_ontology_ids(value))


df["CGSNL Gene Symbol"] = df["CGSNL Gene Symbol"].apply(clean_oryzabase_symbol)
df["Gene symbol synonym(s)"] = df["Gene symbol synonym(s)"].apply(
    clean_oryzabase_symbol_synonyms)
df["CGSNL Gene Name"] = df["CGSNL Gene Name"].apply(_strip_gene_name)
df["Gene name synonym(s)"] = df["Gene name synonym(s)"].apply(_commas_to_bars)
gene_name_columns = [
    "RAP ID",
    "MUS ID",
    "CGSNL Gene Symbol",
    "Gene symbol synonym(s)",
    "CGSNL Gene Name",
    "Gene name synonym(s)",
]
df["gene_names"] = df.apply(
    lambda row: combine_columns(row, gene_name_columns), axis=1)

# Restructuring and combining columns that have ontology term annotations.
ontology_columns = ["Gene Ontology", "Trait Ontology", "Plant Ontology"]
for ontology_column in ontology_columns:
    df[ontology_column] = df[ontology_column].apply(
        _bar_delimited_ontology_ids)
df["term_ids"] = df.apply(
    lambda row: combine_columns(row, ontology_columns), axis=1)

# Adding other expected columns and subsetting the dataset.
df["species"] = "osa"
df["description"] = df["Explanation"].apply(clean_oryzabase_explainations)
コード例 #9
ファイル: combining.py プロジェクト: irbraun/plant-data
# In[5]:


# In[6]:

# Now the dataset looks clean, but the gene columns don't yet reflect all the information
# that was used in the network creation step. For example, we want all the gene identifiers
# found on any row to be present everywhere in the dataset for that given line.

# In[7]:

# Merge all rows that share a "_gene_id", combining three fields with
# concatenate_with_delim and "|" as the delimiter.
# NOTE(review): the column-name keys of this mapping and its closing "})" are
# missing from this copy; restore them from the original notebook rather than
# guessing which columns are aggregated.
agg_df = df.groupby("_gene_id").agg({
    lambda x: concatenate_with_delim("|", x),
    lambda x: concatenate_with_delim("|", x),
    lambda x: concatenate_with_delim("|", x)

# In[8]:

# This is only called by collapse_by_all_gene_names().
# A method necessary for cleaning up lists of gene identifiers after merging.
# This removes things from the other gene identifiers if they are already listed as a unique gene identifier.
# This could happen after merging if one entry was unsure whether some string is a unique identifier, but some other entry confirms that it is.
def remove_duplicate_names(row):
    gene_names = row["unique_gene_identifiers"].split("|")