def analogy(self, x, y, z):
    """
    y is to ??? what z is to x
    :param x: reference entity on the known side of the analogy
    :param y: entity paired with the unknown answer
    :param z: entity paired with x
    :return: list of (term, score) pairs ranking candidate answers
    """
    x = dpu.encode_cell(x)
    y = dpu.encode_cell(y)
    z = dpu.encode_cell(z)
    indexes, metrics = self.M.analogy(pos=[x, y], neg=[z], n=10)
    res = self.M.generate_response(indexes, metrics).tolist()
    return res
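# A hedged usage sketch for analogy(). The `api` handle and the concrete terms
# are illustrative assumptions, not from the source. With pos=[x, y] and
# neg=[z], the ranked answers approximate the vector x + y - z:
#
#   res = api.analogy(x="king", y="woman", z="man")
#   # res is a list of up to 10 (term, score) pairs; "queen"-like terms should
#   # rank near the top ("woman is to queen what man is to king").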
def evaluate_table_attributes(api, args, table_df, entity_attribute, table_name,
                              target_attribute, ranking_size=10, debug=True):
    """
    Given a table dataframe (pandas), an entity attribute and a target attribute,
    asks questions and records the position of the found answers
    :param api: relational_emb api object
    :param args: arguments passed to the program
    :param table_df: dataframe holding the table of interest (pandas dataframe)
    :param entity_attribute: attribute in table_df from which to draw entities
    :param table_name: name of the relation/table the entities come from
    :param target_attribute: attribute in table_df for which we want to predict the answer
    :param ranking_size: the size of the ranking
    :param debug: currently unused
    :return: per-position evaluation results, number of questions asked, number of key errors
    """
    should_sample = args.sample
    evaluation_results = defaultdict(int)
    num_questions = 0
    key_error = 0
    qs = 0
    # Iterate rows of table to draw entity and target_attribute
    for _, el in table_df.iterrows():
        if should_sample:
            if random.randint(1, 10) > 1:
                continue
        qs += 1
        if (qs % 100) == 0:
            print("#q: " + str(qs))
        entity = dpu.encode_cell(el[entity_attribute])
        ground_truth = dpu.encode_cell(el[target_attribute])
        try:
            ranking_result = api.concept_qa(entity, table_name, target_attribute, n=ranking_size)
            # Record the position at which the right answer appears, if it does
            for position, entry in enumerate(ranking_result):
                answer, score = entry
                if answer == ground_truth:
                    evaluation_results[position] += 1
                    break
            num_questions += 1  # One more question
        except KeyError:
            key_error += 1
    # We only recorded the first position where an answer appears; accumulate
    # results into cumulative counts, which are easier to interpret as percentages
    total_hits = 0
    for position in range(ranking_size):
        evaluation_results[position] += total_hits
        total_hits = evaluation_results[position]
    return evaluation_results, num_questions, key_error
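# Hedged usage sketch for evaluate_table_attributes(). The file path and
# attribute names are illustrative assumptions; only the signature above and
# api.concept_qa come from the source.
#
#   table_df = pd.read_csv("path/to/table.csv", encoding='latin1')
#   results, num_q, errs = evaluate_table_attributes(
#       api, args, table_df,
#       entity_attribute="name", table_name="table.csv",
#       target_attribute="country", ranking_size=10)
#   # results[i] is the cumulative number of questions whose ground truth
#   # appeared at position <= i of the ranking.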
def window_column(paths, output_file, debug=False):
    try:
        os.remove(output_file)
    except FileNotFoundError:
        print("Creating new file for writing data")
    total = len(paths)
    current = 0
    for path in paths:
        if debug:
            print(str(current) + "/" + str(total))
        current += 1
        df = pd.read_csv(path, encoding='latin1')
        # Check for valid relations only
        if not dpu.valid_relation(df):
            continue
        columns = df.columns
        with open(output_file, 'a') as out:
            f = csv.writer(out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            # One output row per column, keeping only valid, encoded cells
            for c in columns:
                col_data = df[c]
                row = [dpu.encode_cell(cell_value) for cell_value in col_data
                       if dpu.valid_cell(cell_value)]
                if len(row) > 0:
                    f.writerow(row)
            # TODO: why is it necessary to indicate end of relation?
            f.writerow(["~R!RR*~"])
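# Sketch of how window_column() might be driven; the glob pattern and output
# file name are assumptions. The end-of-relation marker "~R!RR*~" is written
# once per input file so a downstream reader can tell relations apart.
#
#   import glob
#   paths = glob.glob("data/csvs/*.csv")
#   window_column(paths, "training_data.txt", debug=True)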
def column_avg_unique_composition(df, we_model):
    column_we = dict()
    columns = df.columns
    missing_words = 0
    for c in columns:
        col_wes = []
        value = df[c].unique()
        for el in value:
            # Check validity of cell
            if not dpu.valid_cell(el):
                continue
            el = dpu.encode_cell(el)
            if " " in el:
                # Multi-word cell: average the vectors of the words found in the model
                els = el.split(" ")
                vector = we_model.get_vector(els[0])
                missing_words_mini = 0
                for ee in range(1, len(els)):
                    try:
                        vector += we_model.get_vector(els[ee])
                    except KeyError:
                        missing_words += 1
                        missing_words_mini += 1
                vector /= (len(els) - missing_words_mini)
            else:
                try:
                    vector = we_model.get_vector(el)
                except KeyError:
                    missing_words += 1
                    continue
            col_wes.append(vector)
        col_wes = np.asarray(col_wes)
        col_we = np.mean(col_wes, axis=0)
        column_we[c] = col_we
    return column_we, missing_words
def row_avg_composition(df, we_model):
    missing_words = 0
    row_we_dict = dict()
    columns = df.columns
    for i, row in df.iterrows():
        row_wes = []
        for c in columns:
            # Check validity of cell
            if not dpu.valid_cell(row[c]):
                continue
            el = dpu.encode_cell(row[c])
            if " " in el:
                # Multi-word cell: average the vectors of the words found in the model
                els = el.split(" ")
                vector = we_model.get_vector(els[0])
                missing_words_mini = 0
                for ee in range(1, len(els)):
                    try:
                        vector += we_model.get_vector(els[ee])
                    except KeyError:
                        missing_words += 1
                        missing_words_mini += 1
                vector /= (len(els) - missing_words_mini)
            else:
                try:
                    vector = we_model.get_vector(el)
                except KeyError:
                    missing_words += 1
                    continue
            row_wes.append(vector)
        row_wes = np.asarray(row_wes)
        row_we = np.mean(row_wes, axis=0)
        row_we_dict[i] = row_we
    return row_we_dict, missing_words
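# Both composition functions above build averaged word embeddings: one vector
# per column (over its unique cells) or per row (over its cells), splitting
# multi-word cells and averaging the word vectors found in the model. A hedged
# sketch, assuming `we_model` is a loaded embedding model exposing get_vector()
# as used above:
#
#   col_vectors, n_missing_cols = column_avg_unique_composition(df, we_model)
#   row_vectors, n_missing_rows = row_avg_composition(df, we_model)
#   # col_vectors maps column name -> mean vector; row_vectors maps row index
#   # -> mean vector; the missing-word counters tally out-of-vocabulary words.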
def vector_for_entity(self, cell=None, attribute=None, table=None):
    vec = None
    if cell:
        cell = dpu.encode_cell(cell)
        vec = self.M.get_vector(cell)
    elif table:
        table = dpu.encode_cell(table)
        if attribute:
            attribute = dpu.encode_cell(attribute)
            vec = self.RE[table]["columns"][attribute]
        else:
            vec = self.RE[table]["vector"]
    elif attribute:
        attribute = dpu.encode_cell(attribute)
        print("Not supported yet!")
        return
    return vec
def entity_to_attribute(self, entities, n=2, simf=SIMF.COSINE):
    res = []
    for entity in entities:
        entity = dpu.encode_cell(entity)
        vec_e = self.M.get_vector(entity)
        topk = self.topk_columns(vec_e, k=n, simf=simf)
        res.append((entity, topk))
    return res
def _read_columns_from_dataframe(df, columns):
    for c in columns:
        data_values = df[c]
        for cell_value in data_values:
            # We check the cell value is valid before continuing
            if not dpu.valid_cell(cell_value):
                continue
            cell_value = dpu.encode_cell(cell_value)
            yield cell_value
def topk_similar_vectors(self, input_string, k=10, simf=SIMF.COSINE):
    el = dpu.encode_cell(input_string)
    indexes = []
    metrics = []
    if simf == SIMF.COSINE:
        indexes, metrics = self.M.cosine(el, n=k)
    elif simf == SIMF.EUCLIDEAN:
        indexes, metrics = self.M.euclidean(el, n=k)
    res = self.M.generate_response(indexes, metrics).tolist()
    return res
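# Minimal sketch for topk_similar_vectors(); the query string is an
# illustrative assumption.
#
#   neighbors = api.topk_similar_vectors("madrid", k=5, simf=SIMF.COSINE)
#   # Returns a list of (term, metric) pairs; with SIMF.COSINE the metric is a
#   # similarity, with SIMF.EUCLIDEAN a distance.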
def _read_rows_from_dataframe(df, columns):
    for index, el in df.iterrows():
        for c in columns:
            cell_value = el[c]
            # We check the cell value is valid before continuing
            if not dpu.valid_cell(cell_value):
                continue
            # If valid, we clean and format it and return it
            cell_value = dpu.encode_cell(cell_value)
            yield cell_value
def concept_qa(self, entity, relation, attribute, n=20, simf=SIMF.COSINE):
    entity = dpu.encode_cell(entity)
    if " " in entity:  # We have spaces/words now!!
        entity_words = entity.split(" ")
    indexes = []
    metrics = []
    if simf == SIMF.COSINE:
        if " " in entity:
            indexes, metrics = self.M.cosine_array(entity_words, n=n)
        else:
            indexes, metrics = self.M.cosine(entity, n=n)
    elif simf == SIMF.EUCLIDEAN:
        indexes, metrics = self.M.euclidean(entity, n=n)  # TODO: spaces unimplemented here
    res = self.M.generate_response(indexes, metrics).tolist()
    res = [(e, self.re_range_score(score)) for e, score in res]
    vec_attribute = self.RE[relation]["columns"][attribute]
    if type(vec_attribute) is not np.ndarray:
        return []
    candidate_attribute_sim = []
    for e, score in res:
        vec_e = self.M.get_vector(e)  # no need to normalize e --- it's already normalized
        similarity_to_attr = 0
        if simf == SIMF.COSINE:
            similarity_to_attr = np.dot(vec_e, vec_attribute)
            similarity_to_attr = self.re_range_score(similarity_to_attr)
        elif simf == SIMF.EUCLIDEAN:
            similarity_to_attr = 1 - euclidean(vec_e, vec_attribute)
        # Average of the candidate's similarity to the original entity and its
        # similarity to the target attribute
        similarity = (similarity_to_attr + score) / 2
        candidate_attribute_sim.append((e, similarity))
    candidate_attribute_sim = sorted(candidate_attribute_sim, key=lambda x: x[1], reverse=True)
    return candidate_attribute_sim
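# concept_qa() reranks the entity's nearest neighbors by averaging two rescaled
# similarities: neighbor-to-query and neighbor-to-attribute. A hedged sketch;
# the concrete relation and attribute names are assumptions:
#
#   ranking = api.concept_qa("mit", relation="universities.csv",
#                            attribute="city", n=20)
#   # ranking is sorted best-first; each entry is (candidate, avg_similarity).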
def _concept_qa_no_avg_rerank(self, entity, relation, attribute, n=20, simf=SIMF.COSINE):
    entity = dpu.encode_cell(entity)
    indexes = []
    metrics = []
    if simf == SIMF.COSINE:
        indexes, metrics = self.M.cosine(entity, n=n)
    elif simf == SIMF.EUCLIDEAN:
        indexes, metrics = self.M.euclidean(entity, n=n)
    res = self.M.generate_response(indexes, metrics).tolist()
    vec_attribute = self.RE[relation]["columns"][attribute]
    candidate_attribute_sim = []
    for e, score in res:
        vec_e = self.M.get_vector(e)  # no need to normalize e --- it's already normalized
        similarity = 0
        if simf == SIMF.COSINE:
            similarity = np.dot(vec_e, vec_attribute)
            similarity = self.re_range_score(similarity)
        elif simf == SIMF.EUCLIDEAN:
            similarity = 1 - euclidean(vec_e, vec_attribute)
        candidate_attribute_sim.append((e, similarity))
    candidate_attribute_sim = sorted(candidate_attribute_sim, key=lambda x: x[1], reverse=True)
    return candidate_attribute_sim
df = pd.read_csv(csv_filepath, encoding='latin1')
columns = list(df.columns.values)
columnsize = len(columns)
fh.write(f"D:Columns: {columns} \n")
fh.flush()
for index, el in df.iterrows():
    if random.randint(1, 10) > 1:
        continue
    for i in range(3):
        c = random.randint(0, columnsize - 1)
        target_column = random.randint(0, columnsize - 1)
        if c == target_column:  # skip when source and target columns coincide
            continue
        value = dpu.encode_cell(el[c])
        # We're only going one direction with the testing data, and no dates
        if len(value) < 4 or "/" in value:
            continue
        expected = dpu.encode_cell(el[target_column])
        try:
            res = api.concept_qa(value, csv_file, columns[target_column], n=RELEVANTS[-1])
            y = 0
            ind = 0
def similarity_between(self, entity1, entity2, simf=SIMF.COSINE):
    x = dpu.encode_cell(entity1)
    y = dpu.encode_cell(entity2)
    vec_x = self.M.get_vector(x)
    vec_y = self.M.get_vector(y)
    return self.similarity_between_vectors(vec_x, vec_y, simf=simf)
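# Usage sketch for similarity_between(); the entity names are illustrative
# assumptions, and the call may raise KeyError if either encoded entity is
# missing from the embedding vocabulary.
#
#   sim = api.similarity_between("boston", "cambridge", simf=SIMF.COSINE)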