class BaseTestingPipeline(object):
    """
    Base class for testing pipelines
    """

    def __init__(self, retrieval_class=BaseRetrieval, use_celery=False):
        # This points to the class of retrieval we are using
        self.retrieval_class = retrieval_class
        self.use_celery = use_celery
        self.tasks = []
        self.exp = None
        self.options = None
        self.precomputed_queries = []
        self.tfidfmodels = {}
        self.files_dict = {}
        self.main_all_doc_methods = {}
        self.current_all_doc_methods = {}
        self.save_terms = False
        self.max_per_class_results = 1000

    def loadModel(self, guid):
        """
        Loads all the retrieval models for a single file
        """
        for model in self.files_dict[guid]["tfidf_models"]:
            # create a search instance for each method
            self.tfidfmodels[model["method"]] = self.retrieval_class(
                model["actual_dir"],
                model["method"],
                logger=None,
                use_default_similarity=self.exp["use_default_similarity"],
            )

    def generateRetrievalModels(self, all_doc_methods, all_files):
        """
        Populates files_dict with the paths to the retrieval models for each file
        """
        for guid in all_files:
            self.files_dict[guid]["tfidf_models"] = []
            for method in all_doc_methods:
                actual_dir = cp.Corpus.getRetrievalIndexPath(
                    guid,
                    all_doc_methods[method]["index_filename"],
                    self.exp["full_corpus"],
                )
                self.files_dict[guid]["tfidf_models"].append({"method": method, "actual_dir": actual_dir})

    def addRandomControlResult(self, guid, precomputed_query):
        """
        Adds a result that is purely based on analytical chance, for comparison.
        """
        result_dict = {
            "file_guid": guid,
            "citation_id": precomputed_query["citation_id"],
            "doc_position": precomputed_query["doc_position"],
            "query_method": precomputed_query["query_method"],
            "match_guid": precomputed_query["match_guid"],
            "doc_method": "RANDOM",
            "mrr_score": analyticalRandomChanceMRR(self.files_dict[guid]["in_collection_references"]),
            "precision_score": 1 / float(self.files_dict[guid]["in_collection_references"]),
            "ndcg_score": 0,
            "rank": 0,
            "first_result": "",
        }

        # Deal here with CoreSC/AZ/CFC annotation
        for annotation in self.exp.get("rhetorical_annotations", []):
            result_dict[annotation] = precomputed_query.get(annotation)

        self.logger.addResolutionResultDict(result_dict)

    def initializePipeline(self):
        """
        Whatever needs to happen before we start the pipeline: initializing
        connections, VMs, whatever.

        This function should be overridden by descendant classes if anything
        is to be done.
        """
        if self.retrieval_class.__name__.startswith("Lucene"):
            import lucene

            try:
                lucene.initVM(maxheap="640m")  # init Lucene VM
            except ValueError:
                # VM already up
                print(sys.exc_info()[1])

    def startLogging(self):
        """
        Creates the results logger and starts counting and logging.
        """
        output_filename = os.path.join(self.exp["exp_dir"], self.exp.get("output_filename", "results.csv"))
        self.logger = ResultsLogger(
            False,
            dump_filename=output_filename,
            message_text="Running precomputed queries",
            dot_every_xitems=1,
        )  # init all the logging/counting
        self.logger.startCounting()  # for timing the process, start now

    def loadQueriesAndFileList(self):
        """
        Loads the precomputed queries and the list of test files to process.
""" precomputed_queries_file_path = self.exp.get("precomputed_queries_file_path", None) if not precomputed_queries_file_path: precomputed_queries_file_path = os.path.join( self.exp["exp_dir"], self.exp.get("precomputed_queries_filename", "precomputed_queries.json") ) self.precomputed_queries = json.load(open(precomputed_queries_file_path, "r")) files_dict_filename = os.path.join(self.exp["exp_dir"], self.exp.get("files_dict_filename", "files_dict.json")) self.files_dict = json.load(open(files_dict_filename, "r")) self.files_dict["ALL_FILES"] = {} def populateMethods(self): """ Fills dict with all the test methods, parameters and options, including the retrieval instances """ self.tfidfmodels = {} all_doc_methods = None if self.exp.get("doc_methods", None): all_doc_methods = getDictOfTestingMethods(self.exp["doc_methods"]) # essentially this overrides whatever is in files_dict, if testing_methods was passed as parameter if self.exp["full_corpus"]: all_files = ["ALL_FILES"] else: all_files = self.files_dict.keys() self.generateRetrievalModels(all_doc_methods, all_files) else: all_doc_methods = self.files_dict["ALL_FILES"]["doc_methods"] # load from files_dict if self.exp["full_corpus"]: for model in self.files_dict["ALL_FILES"]["tfidf_models"]: # create a search instance for each method self.tfidfmodels[model["method"]] = self.retrieval_class( model["actual_dir"], model["method"], logger=None, use_default_similarity=self.exp["use_default_similarity"], max_results=self.exp["max_results_recall"], save_terms=self.save_terms, ) self.main_all_doc_methods = all_doc_methods def newResultDict(self, guid, precomputed_query, doc_method): """ Creates and populates a new result dict. """ result_dict = { "file_guid": guid, "citation_id": precomputed_query["citation_id"], "doc_position": precomputed_query["doc_position"], "query_method": precomputed_query["query_method"], "doc_method": doc_method, "match_guid": precomputed_query["match_guid"], } # Deal here with CoreSC/AZ/CFC annotation for annotation in self.exp.get("rhetorical_annotations", []): result_dict[annotation] = precomputed_query.get(annotation) return result_dict def addEmptyResult(self, guid, precomputed_query, doc_method): """ Adds an empty result, that is, a result with 0 score due to some error. """ result_dict = self.newResultDict(guid, precomputed_query, doc_method) result_dict["mrr_score"] = 0 result_dict["precision_score"] = 0 result_dict["ndcg_score"] = 0 result_dict["rank"] = 0 result_dict["first_result"] = "" self.logger.addResolutionResultDict(result_dict) def addResult(self, guid, precomputed_query, doc_method, retrieved_results): """ Adds a normal (successful) result to the result log. 
""" result_dict = self.newResultDict(guid, precomputed_query, doc_method) self.logger.measureScoreAndLog(retrieved_results, precomputed_query["citation_multi"], result_dict) ## rank_per_method[result["doc_method"]].append(result["rank"]) ## precision_per_method[result["doc_method"]].append(result["precision_score"]) ## def logTextAndReferences(self, doctext, queries, qmethod): ## """ ## Extra logging, not used right now ## """ ## pre_selection_text=doctext[queries[qmethod]["left_start"]-300:queries[qmethod]["left_start"]] ## draft_text=doctext[queries[qmethod]["left_start"]:queries[qmethod]["right_end"]] ## post_selection_text=doctext[queries[qmethod]["right_end"]:queries[qmethod]["left_start"]+300] ## draft_text=u"<span class=document_text>{}</span> <span class=selected_text>{}</span> <span class=document_text>{}</span>".format(pre_selection_text, draft_text, post_selection_text) ## print(draft_text) def saveResultsAndCleanUp(self): """ This executes after the whole pipeline is done. This is where we save all data that needs to be saved, report statistics, etc. """ self.logger.writeDataToCSV() self.logger.showFinalSummary() def processOneQuery(self, precomputed_query): """ Runs the retrieval and evaluation for a single query """ if self.exp.get("queries_classification", "") not in ["", None]: q_type = precomputed_query[self.exp.get("queries_classification")] if self.per_class_count[q_type] < self.max_per_class_results: self.per_class_count[q_type] += 1 else: print("Too many queries of type %s already" % q_type) return guid = precomputed_query["file_guid"] self.logger.total_citations += self.files_dict[guid]["resolvable_citations"] all_doc_methods = deepcopy(self.main_all_doc_methods) # If we're running per-file resolution and we are now on a different file, load its model if not self.exp["full_corpus"] and guid != self.previous_guid: self.previous_guid = guid self.loadModel(guid) # create a dict where every field gets a weight of 1 for method in self.main_all_doc_methods: all_doc_methods[method]["runtime_parameters"] = { x: 1 for x in self.main_all_doc_methods[method]["runtime_parameters"] } self.current_all_doc_methods = all_doc_methods # for every method used for extracting BOWs for doc_method in all_doc_methods: # Log everything if the logger is enabled ## self.logger.logReport("Citation: "+precomputed_query["citation_id"]+"\n Query method:"+precomputed_query["query_method"]+" \nDoc method: "+doc_method +"\n") ## self.logger.logReport(precomputed_query["query_text"]+"\n") # ACTUAL RETRIEVAL HAPPENING - run query retrieved = self.tfidfmodels[doc_method].runQuery( precomputed_query, addExtraWeights(all_doc_methods[doc_method]["runtime_parameters"], self.exp), guid, max_results=exp.get("max_results_recall", MAX_RESULTS_RECALL), ) if not retrieved: # the query was empty or something self.addEmptyResult(guid, precomputed_query, doc_method) else: self.addResult(guid, precomputed_query, doc_method, retrieved) if self.exp.get("add_random_control_result", False): self.addRandomControlResult(guid, precomputed_query) self.logger.showProgressReport(guid) # prints out info on how it's going def processAllQueries(self): """ MAIN LOOP over all precomputed queries """ for precomputed_query in self.precomputed_queries: self.processOneQuery(precomputed_query) def runPipeline(self, exp, options): """ Run the whole experiment pipeline, loading everything from precomputed json :param exp: experiment dict """ self.exp = exp self.options = options self.max_per_class_results = 
self.exp.get("max_per_class_results", self.max_per_class_results) self.per_class_count = defaultdict(lambda: 0) if self.exp.get("similiarity_tie_breaker", 0): for model in self.tfidfmodels.items(): model.tie_breaker = self.exp["similiarity_tie_breaker"] self.startLogging() self.initializePipeline() self.loadQueriesAndFileList() self.logger.setNumItems(len(self.precomputed_queries)) self.populateMethods() self.previous_guid = "" # MAIN LOOP over all precomputed queries self.processAllQueries() self.saveResultsAndCleanUp()

    def measurePrecomputedResolution(self, retrieval_results, method, parameters, citation_az="*"):
        """
        This is kind of like measureCitationResolution: it takes a list of
        precomputed retrieval_results, then applies the new parameters to
        them. This is how we recompute what Lucene gives us, avoiding having
        to call Lucene again and so speeding it up a lot.

        All we need to do is adjust the weights on the already available
        explanation formulas.
        """
        logger = ResultsLogger(False, dump_straight_to_disk=False)  # init all the logging/counting
        logger.startCounting()  # for timing the process, start now

        logger.setNumItems(len(retrieval_results), print_out=False)

        # for each query-result: (results are packed inside each query for each method)
        for result in retrieval_results:
            # select only the method we're testing for
            if "formulas" not in result:
                # there was an error reading this result
                continue

            formulas = result["formulas"]
            retrieved = runPrecomputedQuery(formulas, parameters)

            result_dict = {
                "file_guid": result["file_guid"],
                "citation_id": result["citation_id"],
                "doc_position": result["doc_position"],
                "query_method": result["query_method"],
                "doc_method": method,
                "az": result["az"],
                "cfc": result["cfc"],
                "match_guid": result["match_guid"],
            }

            if not retrieved or len(retrieved) == 0:  # the query was empty or something
                ## print "Error: ", doc_method, qmethod, tfidfmodels[method].indexDir
                ## logger.addResolutionResult(guid, m, doc_position, qmethod, doc_method, 0, 0, 0)
                result_dict["mrr_score"] = 0
                result_dict["precision_score"] = 0
                result_dict["ndcg_score"] = 0
                result_dict["rank"] = 0
                result_dict["first_result"] = ""

                logger.addResolutionResultDict(result_dict)
            else:
                result = logger.measureScoreAndLog(retrieved, result["citation_multi"], result_dict)

        logger.computeAverageScores()

        results = []
        for query_method in logger.averages:
            for doc_method in logger.averages[query_method]:
                weights = parameters
                data_line = {"query_method": query_method, "doc_method": doc_method, "citation_az": citation_az}

                for metric in logger.averages[query_method][doc_method]:
                    data_line["avg_" + metric] = logger.averages[query_method][doc_method][metric]

                data_line["precision_total"] = logger.scores["precision"][query_method][doc_method]

                results.append(data_line)

        return results
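
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The dict below simply names the
# exp keys this class reads; every value is a placeholder, and an actual run
# additionally needs a populated corpus, prebuilt indexes, a real doc_methods
# specification and the precomputed_queries.json / files_dict.json files in
# exp_dir.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example_exp = {
        "exp_dir": "/path/to/experiment",  # directory holding precomputed_queries.json and files_dict.json
        "full_corpus": False,              # False: load a per-file retrieval model for each test file
        "use_default_similarity": True,
        "doc_methods": {},                 # placeholder; expanded into testing methods by getDictOfTestingMethods()
        "max_results_recall": 200,
        "add_random_control_result": True, # also log the RANDOM baseline for every query
        "rhetorical_annotations": [],      # e.g. ["az"] to carry annotations into the result rows
    }
    example_options = {}

    pipeline = BaseTestingPipeline(retrieval_class=BaseRetrieval, use_celery=False)
    pipeline.runPipeline(example_exp, example_options)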