Example 1
	def similarity_jaccard(self, term_id_list_1, term_id_list_2, inherited=False):
		"""
		Find the similarity between two lists of ontology terms by computing the Jaccard
		similarity between the two sets of all terms inherited by the terms in each list.
		
		Args:
			term_id_list_1 (list of str): A list of ontology term IDs.

			term_id_list_2 (list of str): A list of ontology term IDs.

			inherited (bool, optional): Set to True if the lists already include all inherited terms. By default this
			is False, meaning the ontology graph structure is used to find the additional terms inherited by the
			terms in each of the passed-in lists.
		
		Returns:
			float: The Jaccard similarity between the two lists of terms.
		"""
		if inherited:
			term_id_set_1 = set(term_id_list_1)
			term_id_set_2 = set(term_id_list_2)
		else:
			inherited_term_list_1 = flatten(self._inherited_dict[term_id] for term_id in term_id_list_1)
			inherited_term_list_2 = flatten(self._inherited_dict[term_id] for term_id in term_id_list_2)
			term_id_set_1 = set(inherited_term_list_1)
			term_id_set_2 = set(inherited_term_list_2)

		intersection = term_id_set_1.intersection(term_id_set_2)
		union = term_id_set_1.union(term_id_set_2)
		return(len(intersection)/len(union))
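# A minimal standalone sketch (not part of the source) of the Jaccard logic above, using two
# hypothetical sets of already-inherited term IDs.
inherited_set_1 = {"TO:0000001", "TO:0000010", "TO:0000100"}
inherited_set_2 = {"TO:0000001", "TO:0000010", "TO:0000200"}
jaccard = len(inherited_set_1 & inherited_set_2) / len(inherited_set_1 | inherited_set_2)
assert jaccard == 0.5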
Example 2
def _for_new_texts_get_annotations_vector(term_list, vectorizer, ontology):
    """Not called here, just used to store a method that contains a vectorizer to be used later.
	"""
    term_list = [ontology.subclass_dict.get(x, x) for x in term_list]
    term_list = flatten(term_list)
    term_list = list(set(term_list))
    joined_term_string = " ".join(term_list).strip()
    vector = vectorizer.transform([joined_term_string]).toarray()[0]
    return (vector)
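# A hedged sketch (hypothetical term IDs) of how the vectorizer argument above is assumed to behave:
# a scikit-learn CountVectorizer fit on space-joined term-ID strings, later applied to new term lists.
from sklearn.feature_extraction.text import CountVectorizer
training_strings = ["TO_0000001 TO_0000010", "TO_0000010 TO_0000100"]
vectorizer = CountVectorizer()
vectorizer.fit(training_strings)
new_vector = vectorizer.transform(["TO_0000001 TO_0000100"]).toarray()[0]
print(new_vector)  # one count per term ID in the fitted vocabulary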
Example 3
def build_validation_df_from_ontology(path):
    """Build a validation dataframe of ontology term pairs, labeling each pair as close or not based on the
    ontology graph structure (parents, children, and siblings), and also return a mapping from keys to
    preprocessed term-name strings.
    """
    # Load the ontology and term information.
    ont = Ontology(path)
    term_ids_and_names = [(t.id,t.name) for t in ont.terms() if "obsolete" not in t.name]
    key_to_annotations = {i:[x[0]] for i,x in enumerate(term_ids_and_names)}
    key_to_term_id = {i:x[0] for i,x in enumerate(term_ids_and_names)}
    key_to_text_string = {i:x[1] for i,x in enumerate(term_ids_and_names)}
    key_to_preprocessed_text_string = {i:" ".join(preprocess_string(s)) for i,s in key_to_text_string.items()}
    
    # Get mappings that define which terms are very close to which other ones in the ontology structure.
    parents = {}
    children = {}
    for term in ont.terms():
        parents[term.id] = [t.id for t in term.superclasses(with_self=False, distance=1)]
        children[term.id] = [t.id for t in term.subclasses(with_self=False, distance=1)]
    siblings = {}
    for term in ont.terms():
        siblings[term.id] = flatten([[t for t in children[parent_id] if t!=term.id] for parent_id in parents[term.id]])
    assert len(parents) == len(children)
    assert len(parents) == len(siblings)
    any_close = {}
    for key in parents.keys():
        any_close[key] = flatten([parents[key],children[key],siblings[key]])
        
        
    df = pw.with_annotations(key_to_annotations, ont, "jaccard", tfidf=False).edgelist
    df = df[df["from"]!=df["to"]]
    df["from_id"] = df["from"].map(lambda x: key_to_term_id[x])
    df["to_id"] = df["to"].map(lambda x: key_to_term_id[x])
    df["from_text"] = df["from"].map(lambda x: key_to_text_string[x])
    df["to_text"] = df["to"].map(lambda x: key_to_text_string[x])
    df["close"] = df.apply(lambda x: x["to_id"] in any_close[x["from_id"]], axis=1)
    df["token_overlap"] = df.apply(lambda x: len(set(x["from_text"].split()).intersection(set(x["to_text"].split())))>0, axis=1)
    
    positive_df = df[(df["token_overlap"]==False) & (df["close"]==True)]
    negative_df = df[(df["token_overlap"]==False) & (df["close"]==False)]
    assert negative_df.shape[0]+positive_df.shape[0] == df[df["token_overlap"]==False].shape[0]
    num_positive_examples = positive_df.shape[0]
    validation_df = pd.concat([positive_df, negative_df.sample(num_positive_examples, random_state=2)])
    del df
    return(validation_df, key_to_preprocessed_text_string)
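# A self-contained toy sketch (hypothetical term IDs, not a real ontology) of the "close terms" mapping
# built above: siblings share a direct parent, and any_close pools parents, children, and siblings.
parents = {"A": [], "B": ["A"], "C": ["A"], "D": ["B"]}
children = {"A": ["B", "C"], "B": ["D"], "C": [], "D": []}
siblings = {t: [s for p in parents[t] for s in children[p] if s != t] for t in parents}
any_close = {t: parents[t] + children[t] + siblings[t] for t in parents}
assert any_close["B"] == ["A", "D", "C"]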
Example 4
def token_enrichment(all_ids_to_texts, group_ids):
    """ Obtain a dataframe with the results of a token enrichment analysis using Fisher exact test with the results sorted by p-value.
    
    Args:
        all_ids_to_texts (dict of int:str): A mapping between unique integer IDs (for genes) and some string of text.

        group_ids (list of int): The IDs, a subset of the keys of the dictionary argument, that belong to the group to be tested.
    
    Returns:
        pandas.DataFrame: A dataframe sorted by p-value that contains the results of the enrichment analysis with one row per token.
    """

    # Tokenize the strings of text to identify individual words and find all the unique tokens that appear anywhere in the texts.
    all_ids_to_token_lists = {
        i: word_tokenize(text)
        for i, text in all_ids_to_texts.items()
    }
    unique_tokens = list(set(flatten(all_ids_to_token_lists.values())))

    # For each token, determine the total number of texts that it is present in.
    num_ids_with_token_t = lambda t, id_to_tokens: [
        (t in tokens) for i, tokens in id_to_tokens.items()
    ].count(True)
    token_to_gene_count = {
        t: num_ids_with_token_t(t, all_ids_to_token_lists)
        for t in unique_tokens
    }
    total_num_of_genes = len(all_ids_to_token_lists)
    df = pd.DataFrame(unique_tokens, columns=["token"])
    df["genes_with"] = df["token"].map(lambda x: token_to_gene_count[x])
    df["genes_without"] = total_num_of_genes - df["genes_with"]

    # For each token, determine the total number of texts that belong to the group to be tested that it is present in.
    num_of_genes_in_group = len(group_ids)
    ids_in_group_to_token_lists = {
        i: tokens
        for i, tokens in all_ids_to_token_lists.items() if i in group_ids
    }
    token_to_gene_in_group_count = {
        t: num_ids_with_token_t(t, ids_in_group_to_token_lists)
        for t in unique_tokens
    }
    df["group_genes_with"] = df["token"].map(
        lambda x: token_to_gene_in_group_count[x])
    df["group_genes_without"] = num_of_genes_in_group - df["group_genes_with"]

    # Using those values, perform the Fisher exact test to obtain a p-value for each term, sort the results, and return.
    df["p_value"] = df.apply(lambda row: fisher_exact([[
        row["group_genes_with"], row["genes_with"]
    ], [row["group_genes_without"], row["genes_without"]]])[1],
                             axis=1)
    df.sort_values(by="p_value", inplace=True)
    df.reset_index(inplace=True, drop=True)
    return (df)
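# A toy illustration (made-up counts) of the scipy call used above; fisher_exact returns
# (odds_ratio, p_value), and the function keeps only the p-value at index 1.
from scipy.stats import fisher_exact
table = [[8, 20], [2, 50]]  # [[group_genes_with, genes_with], [group_genes_without, genes_without]]
p_value = fisher_exact(table)[1]
print(p_value)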
Example 5
	def similarity_ic(self, term_id_list_1, term_id_list_2, inherited=False, as_weight=True):
		"""
		Find the similarity between two lists of ontology terms by finding the information
		content of the most specific term shared by the sets of all terms inherited by the
		terms in each list. In this case, the most specific term is the term with the
		maximum information content.

		Args:
			term_id_list_1 (list of str): A list of ontology term IDs.

			term_id_list_2 (list of str): A list of ontology term IDs.

			inherited (bool, optional): Set to True if the lists already include all inherited terms. By default this
			is False, meaning the ontology graph structure is used to find the additional terms inherited by the
			terms in each of the passed-in lists.

			as_weight (bool, optional): If True (the default), use the information content values that have been
			rescaled for use as weights; if False, use the raw information content values.
		
		Returns:
			float: The maximum information content of any common ancestor between the two term lists.
		"""
		if inherited:
			term_id_set_1 = set(term_id_list_1)
			term_id_set_2 = set(term_id_list_2)
		else:
			inherited_term_list_1 = flatten(self._inherited_dict[term_id] for term_id in term_id_list_1)
			inherited_term_list_2 = flatten(self._inherited_dict[term_id] for term_id in term_id_list_2)
			term_id_set_1 = set(inherited_term_list_1)
			term_id_set_2 = set(inherited_term_list_2)

		intersection = list(term_id_set_1.intersection(term_id_set_2))
		if as_weight:
			intersection_ic_values = [self._graph_based_ic_dict_as_weights[term_id] for term_id in intersection]
		else:
			intersection_ic_values = [self._graph_based_ic_dict[term_id] for term_id in intersection]
		if len(intersection_ic_values) == 0:
			return(0.000)
		return(max(intersection_ic_values))
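# A standalone sketch (made-up information content values) of the maximum-IC step above.
ic = {"TO:0000001": 0.2, "TO:0000010": 1.5, "TO:0000100": 3.1}
inherited_set_1 = {"TO:0000001", "TO:0000010"}
inherited_set_2 = {"TO:0000001", "TO:0000010", "TO:0000100"}
shared = inherited_set_1 & inherited_set_2
similarity = max((ic[t] for t in shared), default=0.000)
assert similarity == 1.5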
Example 6
def concatenate_with_delim(delim, elements):
	"""
	Concatenates the strings in the passed-in list with a specific delimiter and returns
	the resulting string. This is useful when preparing strings that are intended to be
	placed within a table object or a delim-separated text file. Any of the input strings
	may themselves already represent delim-separated lists, and this is accounted for.
	
	Args:
	    delim (str): The delimiter to place between elements in the concatenated string.

	    elements (list of str): A list of strings that represent either lists or list elements.
	
	Returns:
	    str: A text string representing a list that is delimited by the provided delimiter.
	"""
	tokens = [token.split(delim) for token in elements]
	tokens = flatten(tokens)
	tokens = filter(None, tokens)
	tokens = [token.strip() for token in tokens]
	tokens = remove_duplicates_retain_order(tokens)
	joined = delim.join(tokens).strip()
	joined = remove_newlines(joined)
	return(joined)
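# A minimal standalone equivalent (inlining stand-ins for the helper functions, which are assumed
# to live elsewhere in the package) showing the intended behavior on hypothetical input.
def concatenate_with_delim_sketch(delim, elements):
	tokens = [t.strip() for e in elements for t in e.split(delim) if t.strip()]
	unique = []
	for t in tokens:  # remove duplicates while retaining the original order
		if t not in unique:
			unique.append(t)
	return delim.join(unique)

assert concatenate_with_delim_sketch("|", ["a|b", "b", " c ", ""]) == "a|b|c"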
Example 7
def concatenate_texts(texts):
	"""
	Combines multiple description strings into a single string. This is different from a simple join with
	whitespace, because it handles additional formatting which is assumed to be necessary for texts that
	are either fragments or full sentences. This includes removing duplicates that differ only by punctuation
	or capitalization, retaining the specific order of the texts, and making sure they are capitalized and 
	punctuated in a standard way that will be parseable by other packages and functions that deal with text.
	
	Args:
	    texts (list of str): A list of arbitrary strings.
	
	Returns:
	    str: The text string that results from concatenating and formatting these text strings.
	"""
	texts = [text.replace(";",".") for text in texts]
	texts = [add_end_character(text.strip()) for text in texts]
	texts = flatten([sent_tokenize(text) for text in texts])
	texts = remove_text_duplicates_retain_order(texts)
	texts = ["{}{}".format(text[0].upper(), text[1:]) for text in texts]
	text = " ".join(texts).strip()
	text = remove_newlines(text)
	return(text)
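# A rough standalone sketch of the formatting steps above (skipping NLTK sentence tokenization),
# run on a hypothetical pair of overlapping description fragments.
def concatenate_texts_sketch(texts):
	texts = [text.replace(";", ".").strip() for text in texts]
	texts = [text if text.endswith(".") else text + "." for text in texts]  # standard end punctuation
	kept, seen = [], set()
	for text in texts:  # drop duplicates that differ only by capitalization or final punctuation
		key = text.lower().rstrip(".")
		if key not in seen:
			seen.add(key)
			kept.append(text[0].upper() + text[1:])
	return " ".join(kept)

assert concatenate_texts_sketch(["leaf is wilted", "Leaf is wilted."]) == "Leaf is wilted."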
Example 8
def term_enrichment(all_ids_to_annotations,
                    group_ids,
                    ontology,
                    inherited=False):
    """ Obtain a dataframe with the results of a term enrichment analysis using Fisher exact test with the results sorted by p-value.
    
    Args:
        all_ids_to_annotations (dict of int:list of str): A mapping between unique integer IDs (for genes) and list of ontology term IDs annotated to them.

        group_ids (list of int): The IDs, a subset of the keys of the dictionary argument, that belong to the group to be tested.
        
        ontology (oats.annotation.ontology.Ontology): An ontology object that should match the ontology from which the annotations are drawn.
        
        inherited (bool, optional): By default this is False, indicating that the lists of ontology term IDs have not already been pre-populated with the terms that are
        superclasses of the terms annotated to each given ID. Set to True to indicate that these superclasses are already accounted for and the process of inheriting additional
        terms should be skipped.
    
    Returns:
        pandas.DataFrame: A dataframe sorted by p-value that contains the results of the enrichment analysis with one row per ontology term.
    """

    # If it has not already been done for this data, use the ontology structure to inherit additional terms for these annotations.
    if inherited:
        all_ids_to_inherited_annotations = all_ids_to_annotations
    else:
        all_ids_to_inherited_annotations = {
            i: ontology.inherited(terms)
            for i, terms in all_ids_to_annotations.items()
        }

    # Find the list of all the unique ontology term IDs that appear anywhere in the annotations.
    unique_term_ids = list(
        set(flatten(all_ids_to_inherited_annotations.values())))

    # For each term, determine the total number of (gene) IDs that it is annotated to.
    num_ids_annot_with_term_t = lambda t, id_to_terms: [
        (t in terms) for i, terms in id_to_terms.items()
    ].count(True)
    term_id_to_gene_count = {
        t: num_ids_annot_with_term_t(t, all_ids_to_inherited_annotations)
        for t in unique_term_ids
    }
    total_num_of_genes = len(all_ids_to_inherited_annotations)
    df = pd.DataFrame(unique_term_ids, columns=["term_id"])
    df["term_label"] = df["term_id"].map(lambda x: _get_term_name(x, ontology))
    df["genes_with"] = df["term_id"].map(lambda x: term_id_to_gene_count[x])
    df["genes_without"] = total_num_of_genes - df["genes_with"]

    # For each term, determine the total number of (gene) IDs within the group to be tested that it is annotated to.
    num_of_genes_in_group = len(group_ids)
    ids_in_group_to_inherited_annotations = {
        i: terms
        for i, terms in all_ids_to_inherited_annotations.items()
        if i in group_ids
    }
    term_id_to_gene_in_group_count = {
        t: num_ids_annot_with_term_t(t, ids_in_group_to_inherited_annotations)
        for t in unique_term_ids
    }
    df["group_genes_with"] = df["term_id"].map(
        lambda x: term_id_to_gene_in_group_count[x])
    df["group_genes_without"] = num_of_genes_in_group - df["group_genes_with"]

    # Using those values, perform the Fisher exact test to obtain a p-value for each term, sort the results, and return.
    df["p_value"] = df.apply(lambda row: fisher_exact([[
        row["group_genes_with"], row["genes_with"]
    ], [row["group_genes_without"], row["genes_without"]]])[1],
                             axis=1)
    df.sort_values(by="p_value", inplace=True)
    df.reset_index(inplace=True, drop=True)
    return (df)
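# A toy sketch (hypothetical gene IDs and already-inherited term annotations) of the four counts
# that feed the Fisher exact test for a single term above.
annotations = {1: ["TO:1"], 2: ["TO:1", "TO:2"], 3: ["TO:2"], 4: []}
group_ids = [1, 2]
term = "TO:1"
genes_with = sum(term in terms for terms in annotations.values())
group_genes_with = sum(term in annotations[i] for i in group_ids)
genes_without = len(annotations) - genes_with
group_genes_without = len(group_ids) - group_genes_with
assert (genes_with, genes_without, group_genes_with, group_genes_without) == (2, 2, 2, 0)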
Example 9
def with_annotations(ids_to_annotations,
                     ontology,
                     metric,
                     tfidf=False,
                     **kwargs):
    """
	Find the distances between nodes of interest in the input dictionary based on the overlap in the
	ontology terms that are mapped to those nodes. The input terms for each ID are given as lists of
	term IDs. All terms inherited by these terms are added in this function using the provided
	ontology object, so that each node is represented by the union of all
	the terms inherited by the terms annotated to it. After that step, the term IDs are simply 
	treated as words in a vocabulary, and the same approach as with n-grams is used to generate 
	the distance matrix.
	
	Args:
	    ids_to_annotations (dict): A mapping between IDs and a list of ontology term ID strings.
	    
	    ontology (Ontology): Ontology object with all necessary fields.
	    
	    metric (str): A string indicating which distance metric should be used (e.g., cosine).
	    
	    tfidf (bool, optional): Whether to use TFIDF weighting or not.
	    
	    **kwargs: All the keyword arguments that can be passed to sklearn.feature_extraction.CountVectorizer()
	
	Returns:
	    oats.pairwise.SquarePairwiseDistances: Distance matrix and accompanying information.

	"""

    # Generate the vector representations of each set of annotations by first inheriting terms then converting to strings.
    ids_to_term_lists = {
        i: list(
            set(flatten([ontology.inherited(term_id)
                         for term_id in term_list])))
        for i, term_list in ids_to_annotations.items()
    }
    ids_to_joined_term_strings = {
        i: " ".join(term_list).strip()
        for i, term_list in ids_to_term_lists.items()
    }
    joined_term_strings_list = ids_to_joined_term_strings.values()
    vectors, vectorizer = vectorize_with_ngrams(joined_term_strings_list,
                                                tfidf=tfidf,
                                                **kwargs)
    joined_term_strings_to_vector_mapping = {
        term_list_string: vector
        for term_list_string, vector in zip(joined_term_strings_list, vectors)
    }

    # Send the relevant functions and arguments to the general case method for generating the distance matrix object.
    to_vector_function = lambda term_list_string, mapping=joined_term_strings_to_vector_mapping: mapping[
        term_list_string]
    to_vector_kwargs = {}
    return (_pairwise_square_general_case(
        ids_to_something=ids_to_joined_term_strings,
        to_vector_now=to_vector_function,
        to_vector_now_kwargs=to_vector_kwargs,
        to_vector_later=_for_new_texts_get_annotations_vector,
        to_vector_later_kwargs={
            "vectorizer": vectorizer,
            "ontology": ontology
        },
        metric=metric))
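# A condensed standalone sketch (hypothetical IDs and terms, using only scikit-learn and scipy) of the
# pipeline above: join each set of inherited terms into a string, vectorize, and compute pairwise distances.
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import pdist, squareform
ids_to_inherited_terms = {"gene1": ["TO_1", "TO_2"], "gene2": ["TO_2", "TO_3"]}
joined_strings = [" ".join(terms) for terms in ids_to_inherited_terms.values()]
vectors = CountVectorizer().fit_transform(joined_strings).toarray()
distance_matrix = squareform(pdist(vectors, metric="jaccard"))
print(distance_matrix)  # square matrix of pairwise distances between the IDs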
Example 10
    term_ids_and_names_with_synonyms.append((i," ".join(ont.term_to_tokens[i])))

key_to_annotations = {i:[x[0]] for i,x in enumerate(term_ids_and_names)}
key_to_term_id = {i:x[0] for i,x in enumerate(term_ids_and_names)}
key_to_text_string = {i:x[1] for i,x in enumerate(term_ids_and_names_with_synonyms)}
key_to_preprocessed_text_string = {i:" ".join(preprocess_string(s)) for i,s in key_to_text_string.items()}

# Get mappings that define which terms are very close to which other ones in the ontology structure.
parents = {}
children = {}
for term in ont.terms():
    parents[term.id] = [t.id for t in term.superclasses(with_self=False, distance=1)]
    children[term.id] = [t.id for t in term.subclasses(with_self=False, distance=1)]
siblings = {}
for term in ont.terms():
    siblings[term.id] = flatten([[t for t in children[parent_id] if t!=term.id] for parent_id in parents[term.id]])
assert len(parents) == len(children)
assert len(parents) == len(siblings)
any_close = {}
for key in parents.keys():
    any_close[key] = flatten([parents[key],children[key],siblings[key]])
    any_close[key] = flatten([parents[key],children[key]])

df = pw.with_annotations(key_to_annotations, ont, "jaccard", tfidf=False).edgelist
df = df[df["from"]!=df["to"]]
df["from_id"] = df["from"].map(lambda x: key_to_term_id[x])
df["to_id"] = df["to"].map(lambda x: key_to_term_id[x])
df["from_text"] = df["from"].map(lambda x: key_to_text_string[x])
df["to_text"] = df["to"].map(lambda x: key_to_text_string[x])
df["close"] = df.apply(lambda x: x["to_id"] in any_close[x["from_id"]], axis=1)
df["token_overlap"] = df.apply(lambda x: len(set(x["from_text"].split()).intersection(set(x["to_text"].split())))>0, axis=1)
Example 11
# For each individual species.
for species in data.get_species():
    df = data.to_pandas()
    subset = df[df["species"] == species]
    sentences = [sent_tokenize(d) for d in subset["descriptions"].values]
    descriptions_not_stemmed = [
        simple_preprocess(d) for d in subset["descriptions"].values
    ]
    descriptions_stemmed = [
        preprocess_string(d) for d in subset["descriptions"].values
    ]
    descriptions_lemmatized = [
        lemmatize_doc(d) for d in subset["descriptions"].values
    ]
    sent_lists[species] = flatten(sentences)
    token_lists[species] = flatten(descriptions_not_stemmed)
    stems_lists[species] = flatten(descriptions_stemmed)
    lemma_lists[species] = flatten(descriptions_lemmatized)

    # What about the distributions of words per gene and sentences per gene?
    dists["species"].extend([species] * subset.shape[0])
    dists["num_words"].extend(
        [len(word_tokenize(x)) for x in subset["descriptions"].values])
    dists["num_sents"].extend(
        [len(sent_tokenize(x)) for x in subset["descriptions"].values])

# For the entire dataset including all of the species.
df = data.to_pandas()
subset = df
sentences = [sent_tokenize(d) for d in subset["descriptions"].values]
Example 12
def reduce_vocab_linares_pontes(descriptions, tokens, distance_matrix, n):
    """
	Implementation of the algorithm described in the paper cited below. In short, this returns the 
	descriptions with each word replaced by the most frequently used token in the set of tokens that 
	consists of that word and the n most similar words as given by the distance matrix provided. Some 
	values of n that are used in the paper are 1, 2, and 3. Note that the descriptions in the passed-in
	dictionary should already be preprocessed in whatever way is necessary, but they should at least
	be formatted as lowercase tokens that are separated by a single space in each description. The
	tokens in the list of tokens should be pulled directly from those descriptions and be found
	by splitting on a single space. They are passed in as a separate list, though, because the index
	of each token in the list has to correspond to the index of that token in the distance matrix.
	Any tokens in the descriptions that are not present in the tokens list are left unaffected
	when the tokens that are present are replaced.

	Elvys Linhares Pontes, Stéphane Huet, Juan-Manuel Torres-Moreno, Andréa Carneiro Linhares. 
	Automatic Text Summarization with a Reduced Vocabulary Using Continuous Space Vectors. 
	21st International Conference on Applications of Natural Language to Information Systems (NLDB),
	2016, Salford, United Kingdom. pp. 440-446, 10.1007/978-3-319-41754-7_46. hal-01779440.
	
	Args:
		descriptions (dict): A mapping between IDs and text descriptions.
		
		tokens (list): A list of strings which are tokens that appear in the descriptions. 
		
		distance_matrix (np.array): A square array of distances between the ith and jth token in the tokens list. 
		
		n (int): The number of most similar words to consider when replacing a word in building the reduced vocabulary.
	
	Returns:
		dict: Mapping between IDs and text descriptions with reduced vocabulary, matches input.

		dict: Mapping between tokens present in the original vocab and the token it is replaced with in the reduced vocabulary.

		dict: Mapping between tokens present in the reduced vocab and lists of corresponding original vocabulary tokens.
	"""

    # Find the frequency distribution of all of the tokens in the passed in descriptions.
    fdist = FreqDist(
        flatten([[token for token in description.split()]
                 for description in descriptions.values()]))
    token_to_index = {token: i for i, token in enumerate(tokens)}
    index_to_token = {i: token for i, token in enumerate(tokens)}

    # Create a mapping between all the tokens and the words they'll be replaced with.
    token_to_reduced_vocab_token = {}
    reduced_vocab_token_to_tokens = defaultdict(list)
    for token in tokens:
        index = token_to_index[token]
        n_indices = np.argpartition(distance_matrix[index], n)[:n]
        n_tokens = [index_to_token[idx] for idx in list(n_indices)]
        n_frequencies = [fdist[token] for token in n_tokens]
        maxfreq_token = n_tokens[np.argmax(n_frequencies)]
        token_to_reduced_vocab_token[token] = maxfreq_token
        reduced_vocab_token_to_tokens[maxfreq_token].append(token)

    # Do the replacements in each input description and return the modified dictionary of them.
    reduced_descriptions = {}
    for i, description in descriptions.items():
        #reduced_description = " ".join([token_to_reduced_vocab_token[token] for token in description.split()])
        reduced_description = " ".join([
            token_to_reduced_vocab_token.get(token, token)
            for token in description.split()
        ])
        reduced_descriptions[i] = reduced_description
    return (reduced_descriptions, token_to_reduced_vocab_token,
            reduced_vocab_token_to_tokens)
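# A toy illustration (made-up tokens, frequencies, and distances) of the replacement rule above:
# each token maps to the most frequent token among itself and its nearest neighbors.
import numpy as np
tokens = ["small", "tiny", "large"]
distance_matrix = np.array([[0.0, 0.1, 0.9],
                            [0.1, 0.0, 0.8],
                            [0.9, 0.8, 0.0]])
frequencies = {"small": 5, "tiny": 1, "large": 3}
n = 2
replacements = {}
for i, token in enumerate(tokens):
    nearest_indices = np.argpartition(distance_matrix[i], n)[:n]
    candidates = [tokens[j] for j in nearest_indices]
    replacements[token] = max(candidates, key=lambda t: frequencies[t])
assert replacements["tiny"] == "small"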