def get_analogy(n_clicks, pos_1, neg_1, pos_2):
    # only run once the button has been clicked and all three fields are filled
    if n_clicks is not None and \
            pos_1 is not None and pos_1 != "" and \
            neg_1 is not None and neg_1 != "" and \
            pos_2 is not None and pos_2 != "":
        ee = EmbeddingEngine()
        # normalize and phrase each input so it matches the embedding vocabulary
        pos_1 = ee.phraser[ee.dp.process_sentence(pos_1.split())[0]]
        neg_1 = ee.phraser[ee.dp.process_sentence(neg_1.split())[0]]
        pos_2 = ee.phraser[ee.dp.process_sentence(pos_2.split())[0]]
        pos_1_vec = ee.get_word_vector(pos_1[0])
        neg_1_vec = ee.get_word_vector(neg_1[0])
        pos_2_vec = ee.get_word_vector(pos_2[0])
        if pos_1_vec is not None and neg_1_vec is not None and pos_2_vec is not None:
            # classic word-vector analogy: pos_2 + (pos_1 - neg_1)
            diff_vec = pos_2_vec + pos_1_vec - neg_1_vec
            norm_diff = diff_vec / np.linalg.norm(diff_vec, axis=0)  # unit length
            close_words = ee.close_words(norm_diff, exclude_self=False)[0]
            # return the nearest word that is not one of the query words
            for close_word in close_words:
                if close_word not in [pos_1[0], neg_1[0], pos_2[0]]:
                    return close_word.replace("_", " ")
        else:
            return "?"
    else:
        return "?"
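# A minimal sketch (not part of the app) of wiring get_analogy into a Dash
# callback. The component ids ("analogy_result", "analogy_run", ...) and the
# `app` object are assumptions for illustration.
from dash.dependencies import Input, Output, State

@app.callback(
    Output("analogy_result", "children"),
    [Input("analogy_run", "n_clicks")],
    [State("analogy_pos_1", "value"),
     State("analogy_neg_1", "value"),
     State("analogy_pos_2", "value")])
def run_analogy(n_clicks, pos_1, neg_1, pos_2):
    # e.g. pos_1 - neg_1 + pos_2, in the spirit of "king - man + woman"
    return get_analogy(n_clicks, pos_1, neg_1, pos_2)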
def get_relevant_materials(_, search_text, n_search_text, plus_elems, minus_elems):
    if search_text is not None and search_text != "":
        ee = EmbeddingEngine()
        # the positive word vectors
        sentence = ee.phraser[ee.dp.process_sentence(search_text.split())]
        # the negative word vectors
        n_sentence = ee.phraser[ee.dp.process_sentence(n_search_text.split())] \
            if n_search_text is not None and len(n_search_text) > 0 else None
        # finding materials sorted by similarity
        most_similar = ee.find_similar_materials(
            sentence=sentence,
            n_sentence=n_sentence,
            min_count=15,
            use_output_emb=True)
        # filtering the results by elements and returning top 50
        elem_filtered = ee.filter_by_elements(most_similar, plus_elems, minus_elems, max=50)
        # display top 50 results
        matlist = ee.most_common_form(elem_filtered[:50])
        material_names, material_scores, material_counts, _ = zip(*matlist)
        return matlist_figure(
            [number_to_substring(name) for name in material_names],
            material_scores,
            material_counts)
    else:
        return ""
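# Usage sketch for the callback above; the query words and the element-filter
# strings are illustrative assumptions about the expected input format.
fig = get_relevant_materials(
    None,              # click count is ignored (hence the "_" parameter)
    "thermoelectric",  # find materials similar to this query
    "battery",         # ... and dissimilar to this one
    "Te",              # plus_elems: elements the results should contain (assumed format)
    "Pb")              # minus_elems: elements to exclude (assumed format)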
def get_similar_words(_, word):
    if word is not None and word != "":
        ee = EmbeddingEngine()
        close_words, scores = ee.close_words(word)
        return [
            html.Span([
                "({:.2f}) {}".format(scores[i], close_word.replace("_", " ")),
                html.Br()
            ])
            for i, close_word in enumerate(close_words)
        ]
    else:
        return ""
class MatSearch(Resource):
    EE = EmbeddingEngine()

    @require_api_key
    def get(self, wordphrase, top_k=100):
        try:
            response = {
                "valid_response": True,
                "response": {
                    'original_wordphrase': wordphrase,
                    'materials': self.EE.find_similar_materials(
                        wordphrase, min_count=10)[0:top_k]
                }
            }
            status_code = status.HTTP_200_OK
        except Exception:
            response = {
                "valid_response": False,
                "error": "Could not find similar materials."
            }
            status_code = status.HTTP_400_BAD_REQUEST
        response = jsonify(response)
        response.status_code = status_code
        return response
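# Client-side sketch for the resource above, assuming it is registered at
# /api/matsearch/<wordphrase> and that the API key is sent in an "X-API-KEY"
# header; both the route and the header name are assumptions. The response
# keys match the handler.
import requests

resp = requests.get("https://example.org/api/matsearch/thermoelectric",
                    headers={"X-API-KEY": "<your key>"})
print(resp.json()["response"]["materials"])  # up to top_k similar materials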
def phrase_tokens(self):
    def ungroup_tokens(toks):
        # flatten phrased token groups back into one dict per token
        new_toks = []
        for t_r in toks:
            new_toks.append([])
            for t in t_r:
                for ii, elem in enumerate(t["text"]):
                    new_toks[-1].append({
                        "text": elem,
                        "pos": t["pos"][ii],
                        "annotation": t["annotation"]})
        return new_toks

    grouped_toks = self.group_and_process()
    ee = EmbeddingEngine()
    for row_idx, tokenRow in enumerate(grouped_toks):
        for idx, token in enumerate(tokenRow):
            grouped_toks[row_idx][idx]["text"] = ee.phraser[ee.dp.process_sentence(
                token["text"] if type(token["text"]) is list else [token["text"]])]
            # merge pos tags for words joined into phrases; track how many
            # original tokens have been consumed so far, since an underscored
            # phrase spans several of them
            new_pos_tags = []
            consumed = 0
            for tok in grouped_toks[row_idx][idx]["text"]:
                n_parts = len(tok.split("_"))
                new_pos_tags.append("_".join(
                    grouped_toks[row_idx][idx]["pos"][consumed:consumed + n_parts]))
                consumed += n_parts
            grouped_toks[row_idx][idx]["pos"] = new_pos_tags
    return ungroup_tokens(grouped_toks)
class EmbeddingResource(Resource):
    EE = EmbeddingEngine()
    embedding_schema = EmbeddingSchema()
    embeddings_schema = EmbeddingSchema(many=True)

    def _prepare_response(self, wordphrases):
        try:
            embeddings = []
            for wp in wordphrases:
                embedding = Embedding(wp, '', self.EE.get_word_vector(wp), compound=False)
                # fall back to a compound embedding for multi-word phrases
                # that have no direct vector
                if " " in wp and embedding.embedding is None:
                    embeddings.append(
                        Embedding(wp, '', self.EE.get_word_vector(wp), compound=True))
                else:
                    embeddings.append(embedding)
            response = {
                "valid_response": True,
                "response": self.embeddings_schema.dump(embeddings)
            }
            status_code = status.HTTP_200_OK
        except Exception:
            response = {
                "valid_response": False,
                "error": "Something went wrong..."
            }
            status_code = status.HTTP_400_BAD_REQUEST
        response = jsonify(response)
        response.status_code = status_code
        return response

    @require_api_key
    def get(self, wordphrase):
        wps = wordphrase.split(',')
        return self._prepare_response(wps)

    @require_api_key
    def post(self):
        json_data = request.get_json(force=True)
        try:
            wordphrases = json_data["wordphrase"]
            wps = wordphrases.split(',')
            return self._prepare_response(wps)
        except KeyError:
            response = jsonify({
                "valid_response": False,
                "error": "Provided JSON does not contain a 'wordphrase' key."
            })
            response.status_code = status.HTTP_400_BAD_REQUEST
            return response
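# Sketch of the two ways to query the embedding resource above. The route
# prefix and API-key header are assumptions; the comma-separated format and
# the "wordphrase" payload key come from the handlers themselves.
import requests

headers = {"X-API-KEY": "<your key>"}

# GET: comma-separated word phrases in the URL
requests.get("https://example.org/api/embeddings/band_gap,thermoelectric",
             headers=headers)

# POST: the same comma-separated list in a JSON body
requests.post("https://example.org/api/embeddings",
              json={"wordphrase": "band_gap,thermoelectric"},
              headers=headers)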
def get_similar_words(_, word):
    if word is not None and word != "":
        ee = EmbeddingEngine()
        close_words, scores = ee.close_words(word, top_k=8)
        return dt.DataTable(
            rows=[{
                "#": i + 1,
                'Words and phrases similar to "{}"'.format(word): w.replace("_", " "),
                # truncate (not round) the score to 3 decimal places
                "Cosine similarity": int(scores[i] * 1000) / 1000
            } for i, w in enumerate(close_words)],
            row_selectable=False,
            filterable=False,
            editable=False,
            sortable=False,
            column_widths=[25, None, 140],
            id='analogies_table')
    else:
        return ""
def __init__(self):
    """
    Constructor for the cluster plot object. Downloads the precomputed
    t-SNE material coordinates and builds an index from normalized
    material names (with at least 10 mentions) to their coordinates.
    """
    ds = np.DataSource()
    material_coords_url = "https://s3-us-west-1.amazonaws.com/materialsintelligence/final_material_map_atl10_30_ee12_lr200.npy"
    ds.open(material_coords_url)  # download (and cache) the coordinates file

    self.ee = EmbeddingEngine()
    self.embs = self.ee.embeddings / self.ee.norm
    self.materials_tsne_data = np.load(ds.abspath(material_coords_url))

    # count total mentions per formula and keep those mentioned at least 10 times
    formula_counts = dict()
    for formula in self.ee.formulas_full:
        formula_counts[formula] = sum(self.ee.formulas_full[formula].values())
    mat_counts = sorted(formula_counts.items(), key=lambda x: x[1], reverse=True)
    mat_counts = [mat_count for mat_count in mat_counts if mat_count[1] >= 10]

    self.norm_matnames = [m[0] for m in mat_counts]
    self.matname2index = {label: i for i, label in enumerate(self.norm_matnames)}
def phrase_tokens(self):
    def ungroup_tokens(toks):
        # flatten phrased token groups back into one dict per token
        new_toks = []
        for t_r in toks:
            new_toks.append([])
            for t in t_r:
                for ii, elem in enumerate(t["text"]):
                    new_toks[-1].append({
                        "text": elem,
                        "pos": t["pos"][ii],
                        "annotation": t["annotation"]})
        return new_toks

    grouped_toks = self.group_and_process()
    ee = EmbeddingEngine()
    for row_idx, tokenRow in enumerate(grouped_toks):
        for idx, token in enumerate(tokenRow):
            # processing the sentence
            processed_sentence, split_indices = ee.dp.process_sentence(
                token["text"] if type(token["text"]) is list else [token["text"]])
            grouped_toks[row_idx][idx]["text"] = ee.phraser[processed_sentence]

            # some tokens are split during processing, so duplicate their pos tags
            processed_pos = []
            for ii, pos in enumerate(grouped_toks[row_idx][idx]["pos"]):
                processed_pos += [pos] if ii not in split_indices else [pos, pos]
            grouped_toks[row_idx][idx]["pos"] = processed_pos

            # merging pos tags for words grouped into phrases; track how many
            # original tokens have been consumed so far, since an underscored
            # phrase spans several of them
            new_pos_tags = []
            consumed = 0
            for tok in grouped_toks[row_idx][idx]["text"]:
                n_parts = len(tok.split("_"))
                new_pos_tags.append("_".join(
                    grouped_toks[row_idx][idx]["pos"][consumed:consumed + n_parts]))
                consumed += n_parts
            grouped_toks[row_idx][idx]["pos"] = new_pos_tags
    return ungroup_tokens(grouped_toks)
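# Stand-alone illustration of the pos-tag merge performed above: each
# underscore-joined phrase consumes one pos tag per component word. The
# example tokens and tags are hypothetical.
def merge_pos_tags(phrased, pos):
    merged, consumed = [], 0
    for tok in phrased:
        n_parts = len(tok.split("_"))
        merged.append("_".join(pos[consumed:consumed + n_parts]))
        consumed += n_parts
    return merged

assert merge_pos_tags(["thin_films", "were", "deposited"],
                      ["JJ", "NNS", "VBD", "VBN"]) == ["JJ_NNS", "VBD", "VBN"]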
class Synonyms(Resource):
    EE = EmbeddingEngine()

    @require_api_key
    def get(self, wordphrase, top_k=8):
        try:
            response = {
                "valid_response": True,
                "response": {
                    'original_wordphrase': wordphrase,
                    'synonyms': self.EE.close_words(wordphrase, top_k)
                }
            }
            status_code = status.HTTP_200_OK
        except Exception:
            response = {
                "valid_response": False,
                "error": "Could not get synonyms."
            }
            status_code = status.HTTP_400_BAD_REQUEST
        response = jsonify(response)
        response.status_code = status_code
        return response
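# Sketch (assumed URL rules) of how the flask_restful resources above might
# be attached to an Api instance; the actual registration lives elsewhere in
# the codebase.
api.add_resource(Synonyms, "/api/synonyms/<string:wordphrase>")
api.add_resource(MatSearch, "/api/matsearch/<string:wordphrase>")
api.add_resource(EmbeddingResource,
                 "/api/embeddings",
                 "/api/embeddings/<string:wordphrase>")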
from flask import Flask, request, jsonify
from flask_restful import Api
from stract.api.models import *
from matstract.models.database import AtlasConnection
from matstract.models.word_embeddings import EmbeddingEngine
from matstract.models.cluster_plot import ClusterPlot
from matstract.models.search import Search
from matstract.models.similar_materials import SimilarMaterials
from matstract.models.errors import *
import json

db = AtlasConnection()
ee = EmbeddingEngine()
app = Flask(__name__)
api = Api(app)
cp = ClusterPlot()


# test endpoint
@app.route('/api/test/<message>', methods=["GET"])
def test_api(message):
    messages = message.split(',')
    test = [APITest(message) for message in messages]
    return TestSchema(many=True).jsonify(test)


# abstracts endpoint
@app.route('/api/abstracts/<abstract_id>', methods=["GET"])
def retrieve_abstracts(abstract_id):
    abstract_ids = abstract_id.split(',')