def analogy(self, x, y, z):
    """
    y is to ??? what z is to x

    Each input is encoded with ``dpu.encode_cell``; ``x`` and ``y`` are then
    handed to ``self.M.analogy`` as the positive terms and ``z`` as the
    negative term, asking for 10 results.
    :param x: first positive term of the analogy query
    :param y: second positive term of the analogy query
    :param z: negative term of the analogy query
    :return: ``self.M.generate_response`` output for the hits, as a list
    """
    positive_terms = [dpu.encode_cell(term) for term in (x, y)]
    negative_terms = [dpu.encode_cell(z)]
    hit_indexes, hit_metrics = self.M.analogy(
        pos=positive_terms, neg=negative_terms, n=10)
    return self.M.generate_response(hit_indexes, hit_metrics).tolist()
def concept_qa(self, entity, relation, attribute, n=20, simf=SIMF.COSINE):
    """
    Rank the neighbours of an entity by their similarity to an attribute vector.

    The encoded entity is used to fetch ``n`` neighbours from ``self.M``
    (cosine or euclidean search, depending on ``simf``). Each neighbour's
    vector is then compared with ``self.RE[relation]["columns"][attribute]``
    and scored as ``1 - distance``.
    :param entity: raw entity value; encoded with ``dpu.encode_cell``
    :param relation: key into ``self.RE``
    :param attribute: key into the relation's ``"columns"`` mapping
    :param n: number of neighbours requested from the model
    :param simf: ``SIMF.COSINE`` or ``SIMF.EUCLIDEAN``; for any other value no
        search is run and the distance defaults to 0
    :return: list of ``(neighbour, similarity)`` tuples, highest similarity first
    """
    encoded_entity = dpu.encode_cell(entity)

    if simf == SIMF.COSINE:
        indexes, metrics = self.M.cosine(encoded_entity, n=n)
    elif simf == SIMF.EUCLIDEAN:
        indexes, metrics = self.M.euclidean(encoded_entity, n=n)
    else:
        indexes, metrics = [], []
    neighbours = self.M.generate_response(indexes, metrics).tolist()

    attribute_vector = self.RE[relation]["columns"][attribute]

    def distance_to_attribute(vector):
        # Same metric family as the neighbour search; 0 for unknown simf.
        if simf == SIMF.COSINE:
            return cosine(vector, attribute_vector)
        if simf == SIMF.EUCLIDEAN:
            return euclidean(vector, attribute_vector)
        return 0

    # The neighbour's own score is not used, only its identity.
    # Original author's note: the vector needs no normalization here because
    # it is already normalized.
    scored = [
        (candidate, 1 - distance_to_attribute(self.M.get_vector(candidate)))
        for candidate, _neighbour_score in neighbours
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored
def entity_to_attribute(self, entities, n=2, simf=SIMF.COSINE):
    """
    Find the top-k columns for each entity.

    :param entities: iterable of raw entity values
    :param n: forwarded to ``self.topk_columns`` as ``k``
    :param simf: similarity function forwarded to ``self.topk_columns``
    :return: list of ``(encoded_entity, topk)`` tuples, in input order
    """
    def top_columns_for(raw_entity):
        # Encode first: the model is queried with the encoded form, and the
        # encoded form is also what gets reported back to the caller.
        encoded = dpu.encode_cell(raw_entity)
        vector = self.M.get_vector(encoded)
        return encoded, self.topk_columns(vector, k=n, simf=simf)

    return [top_columns_for(raw_entity) for raw_entity in entities]
def vector_for_entity(self, cell=None, attribute=None, table=None):
    """
    Look up the vector for a cell, a table, or a column of a table.

    Precedence follows argument truthiness: ``cell`` wins over ``table``,
    and ``attribute`` is only honoured together with ``table``.
    :param cell: raw cell value; its vector is read from ``self.M``
    :param attribute: column name, looked up under ``table`` in ``self.RE``
    :param table: table name, used as a key into ``self.RE``
    :return: the matching vector, or ``None`` when nothing usable was given
        (including the attribute-only case, which just prints a notice)
    """
    if cell:
        return self.M.get_vector(dpu.encode_cell(cell))

    if table:
        table_key = dpu.encode_cell(table)
        if attribute:
            column_key = dpu.encode_cell(attribute)
            return self.RE[table_key]["columns"][column_key]
        return self.RE[table_key]["vector"]

    if attribute:
        # Attribute without a table: the value is still encoded (as in the
        # original flow) but no lookup is available for it.
        dpu.encode_cell(attribute)
        print("Not supported yet!")
        return

    return None
def _read_columns_from_dataframe(df, columns, format="TXT"):
    """
    Yield the encoded cells of ``df`` column by column, skipping ``'nan'``.

    :param df: object indexable by column name, giving an iterable of cells
    :param columns: column names to read, in order
    :param format: accepted but not used by this function
    :return: generator of encoded cell values
    """
    for column_name in columns:
        encoded_cells = (dpu.encode_cell(raw) for raw in df[column_name])
        for encoded in encoded_cells:
            # probably more efficient to avoid nan upstream
            if encoded == 'nan':
                continue
            yield encoded
def _read_rows_from_dataframe(df, columns):
    """
    Yield the encoded cells of ``df`` row by row, skipping ``'nan'``.

    :param df: dataframe iterated with ``iterrows()``
    :param columns: column names to read from every row, in order
    :return: generator of encoded cell values
    """
    for _row_index, record in df.iterrows():
        for column_name in columns:
            encoded = dpu.encode_cell(record[column_name])
            # probably more efficient to avoid nan upstream
            if encoded == 'nan':
                continue
            yield encoded
def topk_similar_vectors(self, input_string, k=10, simf=SIMF.COSINE):
    """
    Return the model's ``k`` nearest entries for an input string.

    :param input_string: raw query; encoded with ``dpu.encode_cell``
    :param k: number of results requested from the model
    :param simf: ``SIMF.COSINE`` or ``SIMF.EUCLIDEAN``; for any other value
        no search is run and empty index/metric lists are used
    :return: ``self.M.generate_response`` output, as a list
    """
    query = dpu.encode_cell(input_string)

    if simf == SIMF.COSINE:
        indexes, metrics = self.M.cosine(query, n=k)
    elif simf == SIMF.EUCLIDEAN:
        indexes, metrics = self.M.euclidean(query, n=k)
    else:
        indexes, metrics = [], []

    return self.M.generate_response(indexes, metrics).tolist()
def _read_rows_from_dataframe(df, columns, format="TXT"):
    """
    Yield the encoded content of ``df`` row by row, as cells or as whole rows.

    With ``format == "TXT"`` every encoded cell is yielded individually; with
    ``format == "CSV"`` one list per row is yielded, in which ``'nan'`` cells
    are replaced by ``""``. Any other format yields nothing.

    NOTE(review): the nesting of the two yields was reconstructed from a
    whitespace-collapsed source (TXT per cell, CSV per row) -- confirm
    against the original file.
    :param df: dataframe iterated with ``iterrows()``
    :param columns: column names to read from every row, in order
    :param format: ``"TXT"`` or ``"CSV"``
    :return: generator of encoded cells (TXT) or lists of cells (CSV)
    """
    emit_cells = format == "TXT"
    emit_rows = format == "CSV"

    for _row_index, record in df.iterrows():
        row_cells = []
        for column_name in columns:
            encoded = dpu.encode_cell(record[column_name])
            # probably more efficient to avoid nan upstream
            row_cells.append("" if encoded == 'nan' else encoded)
            if emit_cells:
                yield encoded
        if emit_rows:
            yield row_cells
def row_avg_composition(path, we_model):
    """
    Build one vector per CSV row by averaging the vectors of its cells.

    The file is read with ``latin1`` encoding. Every cell is encoded with
    ``dpu.encode_cell`` and looked up in ``we_model``; lookups that raise
    ``KeyError`` are counted and left out of the average. If no cell of a row
    has a vector, the mean is taken over an empty array (NumPy reports this
    as nan with a RuntimeWarning).
    :param path: path of the CSV file to read
    :param we_model: object exposing ``get_vector(key)``
    :return: tuple ``(row_we_dict, missing_words)`` -- a dict from row index
        to the averaged vector, and the number of failed lookups
    """
    frame = pd.read_csv(path, encoding='latin1')
    column_names = frame.columns

    averaged_by_row = {}
    missing_words = 0
    for row_index, record in frame.iterrows():
        found_vectors = []
        for column_name in column_names:
            token = dpu.encode_cell(record[column_name])
            try:
                vector = we_model.get_vector(token)
            except KeyError:
                missing_words += 1
            else:
                found_vectors.append(vector)
        averaged_by_row[row_index] = np.mean(np.asarray(found_vectors), axis=0)

    return averaged_by_row, missing_words
def column_avg_composition(path, we_model):
    """
    Build one vector per CSV column by averaging the vectors of its cells.

    The file is read with ``latin1`` encoding. Every cell is encoded with
    ``dpu.encode_cell`` and looked up in ``we_model``; lookups that raise
    ``KeyError`` are counted and left out of the average. If no cell of a
    column has a vector, the mean is taken over an empty array (NumPy reports
    this as nan with a RuntimeWarning).
    :param path: path of the CSV file to read
    :param we_model: object exposing ``get_vector(key)``
    :return: tuple ``(column_we, missing_words)`` -- a dict from column name
        to the averaged vector, and the number of failed lookups
    """
    frame = pd.read_csv(path, encoding='latin1')

    averaged_by_column = {}
    missing_words = 0
    for column_name in frame.columns:
        found_vectors = []
        for raw_value in frame[column_name]:
            token = dpu.encode_cell(raw_value)
            try:
                vector = we_model.get_vector(token)
            except KeyError:
                missing_words += 1
            else:
                found_vectors.append(vector)
        averaged_by_column[column_name] = np.mean(
            np.asarray(found_vectors), axis=0)

    return averaged_by_column, missing_words
def _read_columns_from_dataframe(df, columns):
    """
    Yield every cell of ``df``, encoded, column by column.

    No value is filtered out here.
    :param df: object indexable by column name, giving an iterable of cells
    :param columns: column names to read, in order
    :return: generator of encoded cell values
    """
    for column_name in columns:
        for raw_value in df[column_name]:
            yield dpu.encode_cell(raw_value)
def similarity_between(self, entity1, entity2, simf=SIMF.COSINE):
    """
    Compute the similarity between two entities through their model vectors.

    :param entity1: first raw entity; encoded with ``dpu.encode_cell``
    :param entity2: second raw entity; encoded with ``dpu.encode_cell``
    :param simf: similarity function forwarded to
        ``self.similarity_between_vectors``
    :return: whatever ``self.similarity_between_vectors`` returns
    """
    # Both entities are encoded before either vector is fetched.
    first_key, second_key = dpu.encode_cell(entity1), dpu.encode_cell(entity2)
    first_vector = self.M.get_vector(first_key)
    second_vector = self.M.get_vector(second_key)
    return self.similarity_between_vectors(first_vector, second_vector, simf=simf)