Ejemplo n.º 1
0
 def _remove_duplicate_names(row):
     """Drop synonyms that are already listed as unique gene identifiers.

     Args:
         row: A mapping-like record (e.g. a dataframe row) with the bar
             delimited string fields "unique_gene_identifiers" and
             "other_gene_identifiers".

     Returns:
         The value produced by ``concatenate_with_delim`` for the delimiter
         "|" and the entries of "other_gene_identifiers" that do not also
         appear in "unique_gene_identifiers", in their original order.
     """
     confirmed_identifiers = set(row["unique_gene_identifiers"].split("|"))
     remaining_synonyms = [
         synonym
         for synonym in row["other_gene_identifiers"].split("|")
         if synonym not in confirmed_identifiers
     ]
     return concatenate_with_delim("|", remaining_synonyms)
Ejemplo n.º 2
0
def clean_oryzabase_symbol_synonyms(string):
    """Clean a value from the gene symbol synonym(s) column of the dataset.

    Asterisks are removed, the value is split on commas, and each piece is
    stripped of surrounding whitespace and passed through
    ``remove_enclosing_brackets``.

    Returns:
        A single string representing a bar delimited list of gene symbols,
        as built by ``concatenate_with_delim``.
    """
    without_asterisks = string.replace("*", "")
    cleaned_symbols = [
        remove_enclosing_brackets(piece.strip())
        for piece in without_asterisks.split(",")
    ]
    return concatenate_with_delim("|", cleaned_symbols)
Ejemplo n.º 3
0
def clean_oryzabase_symbol(string):
    """Clean a value from the gene symbol column of the dataset.

    Asterisks are removed, the value is expanded into candidate names by
    ``handle_synonym_in_parentheses`` (called with ``min_length=4``), and
    each candidate is passed through ``remove_enclosing_brackets``.

    Returns:
        A single string representing a bar delimited list of gene symbols,
        as built by ``concatenate_with_delim``.
    """
    without_asterisks = string.replace("*", "")
    retained = []
    for candidate in handle_synonym_in_parentheses(without_asterisks,
                                                   min_length=4):
        unbracketed = remove_enclosing_brackets(candidate)
        # Names shorter than two characters are discarded.
        if len(unbracketed) < 2:
            continue
        retained.append(unbracketed)
    return concatenate_with_delim("|", retained)
Ejemplo n.º 4
0
 def _reorder_unique_gene_identifers(row):
     """Move the gene model entries to the end of the unique identifier list.

     Args:
         row: A mapping-like record (e.g. a dataframe row) with the bar
             delimited string fields "unique_gene_identifiers" and
             "gene_models".

     Returns:
         The value produced by ``concatenate_with_delim`` for the delimiter
         "|" and a list holding first the unique identifiers that are not
         gene models, followed by every entry of "gene_models".
     """
     all_identifiers = row["unique_gene_identifiers"].split("|")
     model_identifiers = row["gene_models"].split("|")
     non_model_identifiers = [
         identifier for identifier in all_identifiers
         if identifier not in model_identifiers
     ]
     return concatenate_with_delim("|",
                                   non_model_identifiers + model_identifiers)
Ejemplo n.º 5
0
    def get_dataframe_for_plantcyc(paths):
        """Read PlantCyc pathway files and stack them into one dataframe.

        Args:
            paths (dict): Maps a species code to the path of the pathways
                file for that species. Each file is read with
                ``pd.read_table`` and must provide the columns listed in
                ``renamed`` below.

        Returns:
            pandas.DataFrame: The rows of all files that reference at least
            one gene identifier, with the columns "species", "group_ids",
            "gene_identifiers", "pathway_id", "pathway_name" and
            "ec_number". The per-file row index is kept as-is. When
            ``paths`` is empty, an empty dataframe with those columns is
            returned.
        """
        # File column name -> column name used in the returned dataframe.
        # These are the same for every file, so they are built only once.
        renamed = {
            "Pathway-id": "pathway_id",
            "Pathway-name": "pathway_name",
            "Reaction-id": "reaction_id",
            "EC": "ec_number",
            "Protein-id": "protein_id",
            "Protein-name": "protein_name",
            "Gene-id": "gene_id",
            "Gene-name": "gene_name",
        }
        usecols = list(renamed)
        identifier_columns = [
            "protein_id", "protein_name", "gene_id", "gene_name"
        ]
        output_columns = [
            "species", "group_ids", "gene_identifiers", "pathway_id",
            "pathway_name", "ec_number"
        ]

        dfs_for_each_species = []
        for species_code, pathways_filepath in paths.items():
            df = pd.read_table(pathways_filepath, usecols=usecols)
            df.rename(columns=renamed, inplace=True)
            df.fillna("", inplace=True)

            # Note, manually reviewed the conventions in gene names for the PlantCyc dataset.
            # The string "unknown" is used for missing values, don't add this as a gene name.
            df.replace(to_replace="unknown", value="", inplace=True)
            df["gene_identifiers"] = df.apply(
                lambda row: concatenate_with_delim(
                    "|", [row[col] for col in identifier_columns]),
                axis=1)

            # Some other manipulations to clean up the data, based on how missing values are
            # specified in the PlantCyc files. Rows where no gene names are referenced are dropped.
            # The explicit copy makes the column assignments below write to an independent
            # dataframe instead of a slice of the one that was read from the file.
            df = df[df["gene_identifiers"] != ""].copy()
            df["ec_number"] = df["ec_number"].map(lambda x: ""
                                                  if x == "-" else x)
            df["group_ids"] = df["pathway_id"]
            df["species"] = species_code
            dfs_for_each_species.append(df[output_columns])

        # pd.concat raises on an empty list, so no input files yields an empty result instead.
        if not dfs_for_each_species:
            return pd.DataFrame(columns=output_columns)
        return pd.concat(dfs_for_each_species)
Ejemplo n.º 6
0
    def _collapse_by_all_gene_names(self, case_sensitive=False):
        """Merge all records where the species and any of the listed gene names or identifiers match.

        Text descriptions are concatenated and a union of the gene names and
        ontology term IDs is retained. Records are grouped by the connected
        components of a graph whose edges are produced for each row by
        ``Dataset._generate_edges``.

        Args:
            case_sensitive (bool, optional): Set to true if gene names that only differ in terms of
                case should be treated as different genes; by default these genes are considered
                to be the same gene. The value is forwarded to ``Dataset._generate_edges``.
        """

        # Model the data as a graph in which the nodes are gene names or IDs.
        per_row_edges = self.df.apply(Dataset._generate_edges,
                                      case_sensitive=case_sensitive,
                                      axis=1)
        graph = nx.Graph()
        graph.add_edges_from(list(chain.from_iterable(per_row_edges.values)))

        # Map every node (IDs and gene names) to the index of the connected component that
        # contains it. Each component is expected to hold at least one ID node, because every
        # gene name is associated with the ID of the row it came from in the input data.
        # Only the ID nodes are looked up below, so mapping the gene names as well is more work
        # than strictly needed; restricting the mapping to IDs is a possible optimization.
        node_to_component = {
            node: component_index
            for component_index, node_set in enumerate(
                nx.connected_components(graph))
            for node in node_set
        }

        # Record for each entry which connected component it belongs to.
        self.df["component"] = self.df["id"].map(node_to_component)

        def first_value(values):
            return values.values[0]

        def join_with_bars(values):
            return concatenate_with_delim("|", values)

        def merge_texts(values):
            return concatenate_texts(values)

        # Group by the connected component column and merge the other fields appropriately.
        self.df = self.df.groupby("component").agg({
            "species": first_value,
            "unique_gene_identifiers": join_with_bars,
            "other_gene_identifiers": join_with_bars,
            "gene_models": join_with_bars,
            "descriptions": merge_texts,
            "annotations": join_with_bars,
            "sources": join_with_bars,
        })

        # Merging may have resulted in names or identifiers being listed both as gene names and
        # as synonyms. Remove them from the synonym list if they are in the gene name list.
        self.df["other_gene_identifiers"] = self.df.apply(
            Dataset._remove_duplicate_names, axis=1)
        self.df["unique_gene_identifiers"] = self.df.apply(
            Dataset._reorder_unique_gene_identifers, axis=1)

        # Reset the ID values in the dataset to reflect this change.
        self.df["id"] = None
        self._reset_ids()
        self._update_dictionaries()
Ejemplo n.º 7
0
         bins=25,
         range=(0, 50),
         density=False,
         alpha=0.8,
         histtype='stepfilled',
         color="black",
         edgecolor='none')
fig.set_size_inches(15, 4)
fig.tight_layout()
fig.show()
plt.close()

# In[10]:

# Organizing the desired information into a standard set of column headers.
# combine_columns collects the values of the named columns from one row and passes them,
# together with the delimiter "|", to concatenate_with_delim (defined elsewhere).
combine_columns = lambda row, columns: concatenate_with_delim(
    "|", [row[column] for column in columns])
# Constant species labels assigned to every row.
df["species_code"] = "sly"
df["species_name"] = "tomato"
# The unprocessed text is taken directly from the allele phenotype column.
df["text_unprocessed"] = df["allele_phenotype"]
# Unique identifiers are built from the locus and locus symbol columns.
df["unique_gene_identifiers"] = df.apply(
    lambda x: combine_columns(x, ["locus", "locus_symbol"]), axis=1)
# The remaining name columns are combined into the other (non-unique) identifiers.
df["other_gene_identifiers"] = df.apply(lambda x: combine_columns(
    x, ["locus_name", "allele_symbol", "allele_name"]),
                                        axis=1)
# The gene model column is filled from the locus column.
df["gene_models"] = df["locus"]
# No annotations are provided here, so the column is set to empty strings.
df["annotations"] = ""
# Constant provenance information assigned to every row.
df["reference_name"] = "SGN"
df["reference_link"] = "https://solgenomics.net/"
df["reference_file"] = "sgn_tomato_phenotyped_loci.txt"
# Subset the dataframe using reshaped_columns, which is defined outside this cell
# (presumably the list of standard column headers -- confirm where it is defined).
df = df[reshaped_columns]
df.head(10)
Ejemplo n.º 8
0
def clean_oryzabase_explainations(string):
    """Remove ontology term IDs from a value of the explanation column.

    Should be applied to the explanation column in the dataset. Every ID
    reported by ``get_ontology_ids`` is deleted from the text, and the
    result is then passed through ``remove_punctuation``.

    Args:
        string (str): The raw value of the explanation column.

    Returns:
        str: A version of the value without some of the redundant
        information. When no ontology IDs are found, the input is returned
        unchanged (punctuation included), as it was before this fix.
    """
    found_any_id = False
    # Delete every ID before touching the punctuation. Stripping punctuation inside this loop
    # (as was done previously) alters the text after the first ID, so later IDs that contain
    # punctuation would no longer be found by the exact-match replacement.
    # NOTE(review): assumes remove_punctuation strips characters that occur in the IDs
    # (e.g. a ":" separator) -- confirm against get_ontology_ids and remove_punctuation.
    for ontology_id in get_ontology_ids(string):
        string = string.replace(ontology_id, "")
        found_any_id = True
    if found_any_id:
        string = remove_punctuation(string)
    return string


# In[14]:


# Restructuring and combining columns that have gene name information.
# combine_columns collects the values of the named columns from one row and passes them,
# together with the delimiter "|", to concatenate_with_delim (defined elsewhere).
combine_columns = lambda row, columns: concatenate_with_delim("|", [row[column] for column in columns])
# Each gene name column is cleaned in place before the columns are combined.
df["CGSNL Gene Symbol"] = df["CGSNL Gene Symbol"].apply(clean_oryzabase_symbol)
df["Gene symbol synonym(s)"] = df["Gene symbol synonym(s)"].apply(clean_oryzabase_symbol_synonyms)
# Underscores are removed from the gene name and surrounding whitespace is stripped.
df["CGSNL Gene Name"] = df["CGSNL Gene Name"].apply(lambda x: x.replace("_","").strip())
# Gene name synonyms are converted from comma delimited to bar delimited.
df["Gene name synonym(s)"] = df["Gene name synonym(s)"].apply(lambda x: replace_delimiter(text=x, old_delim=",", new_delim="|"))
# All identifier and name columns are combined into a single gene_names column.
df["gene_names"] = df.apply(lambda x: combine_columns(x, ["RAP ID","MUS ID","CGSNL Gene Symbol", "Gene symbol synonym(s)", "CGSNL Gene Name", "Gene name synonym(s)"]), axis=1)

# Restructuring and combining columns that have ontology term annotations.
# Each ontology column is replaced by the IDs that get_ontology_ids finds in it, joined with "|".
df["Gene Ontology"] = df["Gene Ontology"].apply(lambda x: concatenate_with_delim("|", get_ontology_ids(x)))
df["Trait Ontology"] = df["Trait Ontology"].apply(lambda x: concatenate_with_delim("|", get_ontology_ids(x))) 
df["Plant Ontology"] = df["Plant Ontology"].apply(lambda x: concatenate_with_delim("|", get_ontology_ids(x))) 
# The three ontology columns are combined into a single term_ids column.
df["term_ids"] = df.apply(lambda x: combine_columns(x, ["Gene Ontology","Trait Ontology","Plant Ontology"]), axis=1)

# Adding other expected columns and subsetting the dataset.
df["species"] = "osa"
# The description is the explanation text after cleaning with clean_oryzabase_explainations.
df["description"] = df["Explanation"].apply(clean_oryzabase_explainations)
Ejemplo n.º 9
0
# In[5]:

# Count how many rows share each value of the _gene_id column.
df._gene_id.value_counts()

# In[6]:

# Now the dataset looks clean, but the gene columns don't yet reflect all the information
# that was used in the network creation step. For example, we want all the gene identifiers
# found on any row to be present everywhere in the dataset for that given line.

# In[7]:

# For every _gene_id group, pass the values of each gene identifier column to
# concatenate_with_delim with "|" as the delimiter, giving one combined value per group.
agg_df = df.groupby("_gene_id").agg({
    "unique_gene_identifiers":
    lambda x: concatenate_with_delim("|", x),
    "other_gene_identifiers":
    lambda x: concatenate_with_delim("|", x),
    "gene_models":
    lambda x: concatenate_with_delim("|", x)
})

# In[8]:


# This is only called by collapse_by_all_gene_names().
# A method necessary for cleaning up lists of gene identifiers after merging.
# This removes things from the other gene identifiers if they are already listed as a unique gene identifier.
# This could happen after merging if one entry was unsure whether some string is a unique identifier, but some other entry confirms that it is.
def remove_duplicate_names(row):
    gene_names = row["unique_gene_identifiers"].split("|")