def eval_similarity(query_data):
    """Evaluate similarity of one query against a batch of candidate expressions.

    Args:
        query_data: tuple ``(query, start_idx, expressions)`` where
            ``query`` is ``(query_name, query_expression)`` (an SLT string),
            ``start_idx`` is the global index of the first expression in the
            batch, and ``expressions`` is an iterable of tab-separated lines
            of the form ``expression\\tdoc_id\\tlocation``.

    Returns:
        list of ``(scores, global_index)`` tuples. The global index is
        returned alongside each score because expressions that fail to
        process are skipped, so positions cannot be inferred from order.
    """
    query, start_idx, expressions = query_data
    # QUOTE_NONE + escapechar preserves embedded tabs/backslashes in formulas.
    csv_reader = csv.reader(expressions, delimiter='\t', lineterminator='\n',
                            quoting=csv.QUOTE_NONE, escapechar="\\")
    end_idx = start_idx + len(expressions) - 1

    # Build the query SLT and its constraints once; reused for every candidate.
    query_name, query_expression = query
    query_tree = SymbolTree.parse_from_slt(query_expression)
    query_constraints = Query.create_default_constraints(query_tree)

    results = []
    for idx, parts in enumerate(csv_reader):
        expression = parts[0]
        doc_id = parts[1]
        location = parts[2]
        try:
            # Parsing the candidate is inside the try as well: a malformed
            # expression should skip this candidate, not kill the whole batch.
            candidate_tree = SymbolTree.parse_from_slt(expression)
            data = SIM_FUNCTION(query_tree, candidate_tree, query_constraints)
            scores = data[0]  # first component is the score vector
        except Exception:  # narrowed from bare except: keep SystemExit/KeyboardInterrupt fatal
            print("Error processing: ")
            print(query_expression, flush=True)
            print(expression, flush=True)
            print("Doc: " + doc_id, flush=True)
            print("Loc: " + location, flush=True)
            continue
        # the index is only returned because some expressions might be absent
        # in case of errors
        results.append((scores, start_idx + idx))

    print("Processed: " + str(start_idx) + " to " + str(end_idx) + " finished", flush=True)
    return results
def eval_similarity(query_data):
    # NOTE(review): this redefines eval_similarity; at module level the later
    # definition shadows the earlier one — confirm which variant callers expect.
    """Evaluate similarity of one query against a batch of candidate expressions.

    Uses ``similarity_v04`` directly and splits each input line on tabs
    (instead of going through csv.reader as the sibling variant does).

    Args:
        query_data: tuple ``(query, start_idx, expressions)`` where
            ``query`` is ``(query_name, query_expression)`` (an SLT string),
            ``start_idx`` is the global index of the first expression, and
            ``expressions`` is a list of ``expression\\tdoc_id\\tlocation`` lines.

    Returns:
        list of ``(scores, global_index)`` tuples; failed expressions are
        skipped, hence the explicit index.
    """
    query, start_idx, expressions = query_data
    end_idx = start_idx + len(expressions) - 1

    # Build the query SLT and its constraints once; reused for every candidate.
    query_name, query_expression = query
    query_tree = SymbolTree.parse_from_slt(query_expression)
    query_constraints = Query.create_default_constraints(query_tree)

    results = []
    for idx, expression_info in enumerate(expressions):
        parts = expression_info.strip().split("\t")
        expression = parts[0]
        doc_id = parts[1]
        location = parts[2]
        try:
            # Parse inside the try: a malformed candidate should skip this
            # entry, not abort the whole batch.
            candidate_tree = SymbolTree.parse_from_slt(expression)
            scores, matched_q, matched_c, unified_c = similarity_v04(
                query_tree, candidate_tree, query_constraints)
        except Exception:  # narrowed from bare except: keep SystemExit/KeyboardInterrupt fatal
            print("Error processing: ")
            print(query_expression, flush=True)
            print(expression, flush=True)
            print("Doc: " + doc_id, flush=True)
            print("Loc: " + location, flush=True)
            continue
        # the index is only returned because some expressions might be absent
        # in case of errors
        results.append((scores, start_idx + idx))

    print("Processed: " + str(start_idx) + " to " + str(end_idx) + " finished", flush=True)
    return results
def pivot_by_docs(self, how):
    # process all query results
    """Group text-query and math-query results by document.

    Populates ``self.by_document`` (a dict keyed by document NAME, not id —
    "CHANGED TO MATCH ON DOC NAME ALWAYS" per the original note) with
    CompQueryResult entries carrying text scores/positions and math scores.

    how = "core" => use core value ranks directly
          "MSS"  => rerank with similarity_v06
          "v09" / "v10" / "v11" => rerank with the corresponding similarity
          anything else => keep each result's original score
    """
    self.by_document = {}

    if self.tquery:
        for doc_id in self.tquery.results.keys():
            (docname, score, positions) = self.tquery.results[doc_id]
            # add document if first time seen — join on docname, not doc_id
            try:
                doc = self.by_document[docname]
            except KeyError:  # narrowed from bare except — only "missing key" means "new doc"
                doc = CompQueryResult(doc_id, docname)
                self.by_document[docname] = doc
            # add score of keyword match to current document
            doc.set_tscore(score)
            doc.set_tpos(positions)

    if self.mqueries:
        # Dispatch table replaces the original if/elif chain over `how`.
        rerankers = {
            "MSS": similarity_v06,
            "v09": similarity_v09,
            "v10": similarity_v10,
            "v11": similarity_v11,
        }
        for qexprnum, query in enumerate(self.mqueries):
            # keep scores for all existing formulas over all documents
            sim_fn = rerankers.get(how)
            for result in query.results.values():
                # N.B. only one Result structure per matched formula expression
                if sim_fn is not None:
                    sim_res = sim_fn(query.tree, result.tree,
                                     Query.create_default_constraints(query.tree))
                    # scores are the first component of the result — the other
                    # components are node sets
                    result.new_scores = sim_res[0]
                else:
                    # otherwise, just use original score
                    result.new_scores = [result.original_score]

                for doc_id, offset in result.locations:
                    title = query.documents[doc_id]
                    title = os.path.basename(title)  # just last part (KMD)
                    joiner = title  # join on title instead of doc_id
                    # add document if first time seen
                    try:
                        doc = self.by_document[joiner]
                        # prefer using math ids (to match positions later)
                        doc.doc_id = doc_id
                    except KeyError:  # narrowed from bare except
                        doc = CompQueryResult(doc_id, title)
                        self.by_document[joiner] = doc
                    # add current result to current document
                    doc.add_mscore(qexprnum, result)