Python ResultsLogger Examples

Programming Language: Python

Namespace/Package Name: minerva.proc.results_logging

Class/Type: ResultsLogger

Examples at hotexamples.com: 3

Python ResultsLogger - 3 examples found. These are the top rated real world Python examples of minerva.proc.results_logging.ResultsLogger extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

addResolutionResultDict(2)

measureScoreAndLog(2)

setNumItems(2)

startCounting(2)

computeAverageScores(1)

showFinalSummary(1)

showProgressReport(1)

writeDataToCSV(1)

Example #1

Show file

File: base_pipeline.py Project: danieldmm/minerva

 def startLogging(self):
     """
         Creates the results logger and starts counting and logging.
     """
     output_filename = os.path.join(self.exp["exp_dir"], self.exp.get("output_filename", "results.csv"))
     self.logger = ResultsLogger(
         False, dump_filename=output_filename, message_text="Running precomputed queries", dot_every_xitems=1
     )  # init all the logging/counting
     self.logger.startCounting()  # for timing the process, start now

Example #2

Show file

File: base_pipeline.py Project: danieldmm/minerva

class BaseTestingPipeline(object):
    """
        Base class for testing pipelines
    """

    def __init__(self, retrieval_class=BaseRetrieval, use_celery=False):
        # This points to the the class of retrieval we are using
        self.retrieval_class = retrieval_class
        self.use_celery = use_celery
        self.tasks = []
        self.exp = None
        self.options = None
        self.precomputed_queries = []
        self.tfidfmodels = {}
        self.files_dict = {}
        self.main_all_doc_methods = {}
        self.current_all_doc_methods = {}
        self.save_terms = False
        self.max_per_class_results = 1000

    def loadModel(self, guid):
        """
            Loads all the retrieval models for a single file
        """
        for model in self.files_dict[guid]["tfidf_models"]:
            # create a search instance for each method
            self.tfidfmodels[model["method"]] = self.retrieval_class(
                model["actual_dir"],
                model["method"],
                logger=None,
                use_default_similarity=self.exp["use_default_similarity"],
            )

    def generateRetrievalModels(self, all_doc_methods, all_files):
        """
            Generates the files_dict with the paths to the retrieval models
        """
        for guid in all_files:
            self.files_dict[guid]["tfidf_models"] = []
            for method in all_doc_methods:
                actual_dir = cp.Corpus.getRetrievalIndexPath(
                    guid, all_doc_methods[method]["index_filename"], self.exp["full_corpus"]
                )
                self.files_dict[guid]["tfidf_models"].append({"method": method, "actual_dir": actual_dir})

    def addRandomControlResult(self, guid, precomputed_query):
        """
             Adds a result that is purely based on analytical chance, for
             comparison.
        """
        result_dict = {
            "file_guid": guid,
            "citation_id": precomputed_query["citation_id"],
            "doc_position": precomputed_query["doc_position"],
            "query_method": precomputed_query["query_method"],
            "match_guid": precomputed_query["match_guid"],
            "doc_method": "RANDOM",
            "mrr_score": analyticalRandomChanceMRR(self.files_dict[guid]["in_collection_references"]),
            "precision_score": 1 / float(self.files_dict[guid]["in_collection_references"]),
            "ndcg_score": 0,
            "rank": 0,
            "first_result": "",
        }

        # Deal here with CoreSC/AZ/CFC annotation
        for annotation in self.exp.get("rhetorical_annotations", []):
            result_dict[annotation] = precomputed_query.get(annotation)

        self.logger.addResolutionResultDict(result_dict)

    def initializePipeline(self):
        """
            Whatever needs to happen before we start the pipeline: inializing
            connections, VMs, whatever.

            This function should be overriden by descendant classes if anything
            is to be done.
        """
        if self.retrieval_class.__name__.startswith("Lucene"):
            import lucene

            try:
                lucene.initVM(maxheap="640m")  # init Lucene VM
            except ValueError:
                # VM already up
                print(sys.exc_info()[1])

    def startLogging(self):
        """
            Creates the results logger and starts counting and logging.
        """
        output_filename = os.path.join(self.exp["exp_dir"], self.exp.get("output_filename", "results.csv"))
        self.logger = ResultsLogger(
            False, dump_filename=output_filename, message_text="Running precomputed queries", dot_every_xitems=1
        )  # init all the logging/counting
        self.logger.startCounting()  # for timing the process, start now

    def loadQueriesAndFileList(self):
        """
            Loads the precomputed queries and the list of test files to process.
        """
        precomputed_queries_file_path = self.exp.get("precomputed_queries_file_path", None)
        if not precomputed_queries_file_path:
            precomputed_queries_file_path = os.path.join(
                self.exp["exp_dir"], self.exp.get("precomputed_queries_filename", "precomputed_queries.json")
            )
        self.precomputed_queries = json.load(open(precomputed_queries_file_path, "r"))
        files_dict_filename = os.path.join(self.exp["exp_dir"], self.exp.get("files_dict_filename", "files_dict.json"))
        self.files_dict = json.load(open(files_dict_filename, "r"))
        self.files_dict["ALL_FILES"] = {}

    def populateMethods(self):
        """
            Fills dict with all the test methods, parameters and options, including
            the retrieval instances
        """
        self.tfidfmodels = {}
        all_doc_methods = None

        if self.exp.get("doc_methods", None):
            all_doc_methods = getDictOfTestingMethods(self.exp["doc_methods"])
            # essentially this overrides whatever is in files_dict, if testing_methods was passed as parameter

            if self.exp["full_corpus"]:
                all_files = ["ALL_FILES"]
            else:
                all_files = self.files_dict.keys()

            self.generateRetrievalModels(all_doc_methods, all_files)
        else:
            all_doc_methods = self.files_dict["ALL_FILES"]["doc_methods"]  # load from files_dict

        if self.exp["full_corpus"]:
            for model in self.files_dict["ALL_FILES"]["tfidf_models"]:
                # create a search instance for each method
                self.tfidfmodels[model["method"]] = self.retrieval_class(
                    model["actual_dir"],
                    model["method"],
                    logger=None,
                    use_default_similarity=self.exp["use_default_similarity"],
                    max_results=self.exp["max_results_recall"],
                    save_terms=self.save_terms,
                )

        self.main_all_doc_methods = all_doc_methods

    def newResultDict(self, guid, precomputed_query, doc_method):
        """
            Creates and populates a new result dict.
        """
        result_dict = {
            "file_guid": guid,
            "citation_id": precomputed_query["citation_id"],
            "doc_position": precomputed_query["doc_position"],
            "query_method": precomputed_query["query_method"],
            "doc_method": doc_method,
            "match_guid": precomputed_query["match_guid"],
        }

        # Deal here with CoreSC/AZ/CFC annotation
        for annotation in self.exp.get("rhetorical_annotations", []):
            result_dict[annotation] = precomputed_query.get(annotation)

        return result_dict

    def addEmptyResult(self, guid, precomputed_query, doc_method):
        """
            Adds an empty result, that is, a result with 0 score due to some error.
        """
        result_dict = self.newResultDict(guid, precomputed_query, doc_method)
        result_dict["mrr_score"] = 0
        result_dict["precision_score"] = 0
        result_dict["ndcg_score"] = 0
        result_dict["rank"] = 0
        result_dict["first_result"] = ""
        self.logger.addResolutionResultDict(result_dict)

    def addResult(self, guid, precomputed_query, doc_method, retrieved_results):
        """
            Adds a normal (successful) result to the result log.
        """
        result_dict = self.newResultDict(guid, precomputed_query, doc_method)
        self.logger.measureScoreAndLog(retrieved_results, precomputed_query["citation_multi"], result_dict)

    ##        rank_per_method[result["doc_method"]].append(result["rank"])
    ##        precision_per_method[result["doc_method"]].append(result["precision_score"])

    ##    def logTextAndReferences(self, doctext, queries, qmethod):
    ##        """
    ##            Extra logging, not used right now
    ##        """
    ##        pre_selection_text=doctext[queries[qmethod]["left_start"]-300:queries[qmethod]["left_start"]]
    ##        draft_text=doctext[queries[qmethod]["left_start"]:queries[qmethod]["right_end"]]
    ##        post_selection_text=doctext[queries[qmethod]["right_end"]:queries[qmethod]["left_start"]+300]
    ##        draft_text=u"<span class=document_text>{}</span> <span class=selected_text>{}</span> <span class=document_text>{}</span>".format(pre_selection_text, draft_text, post_selection_text)
    ##        print(draft_text)

    def saveResultsAndCleanUp(self):
        """
            This executes after the whole pipeline is done. This is where we
            save all data that needs to be saved, report statistics, etc.
        """
        self.logger.writeDataToCSV()
        self.logger.showFinalSummary()

    def processOneQuery(self, precomputed_query):
        """
            Runs the retrieval and evaluation for a single query
        """
        if self.exp.get("queries_classification", "") not in ["", None]:
            q_type = precomputed_query[self.exp.get("queries_classification")]
            if self.per_class_count[q_type] < self.max_per_class_results:
                self.per_class_count[q_type] += 1
            else:
                print("Too many queries of type %s already" % q_type)
                return

        guid = precomputed_query["file_guid"]
        self.logger.total_citations += self.files_dict[guid]["resolvable_citations"]

        all_doc_methods = deepcopy(self.main_all_doc_methods)

        # If we're running per-file resolution and we are now on a different file, load its model
        if not self.exp["full_corpus"] and guid != self.previous_guid:
            self.previous_guid = guid
            self.loadModel(guid)

        # create a dict where every field gets a weight of 1
        for method in self.main_all_doc_methods:
            all_doc_methods[method]["runtime_parameters"] = {
                x: 1 for x in self.main_all_doc_methods[method]["runtime_parameters"]
            }

        self.current_all_doc_methods = all_doc_methods

        # for every method used for extracting BOWs
        for doc_method in all_doc_methods:
            # Log everything if the logger is enabled
            ##                self.logger.logReport("Citation: "+precomputed_query["citation_id"]+"\n Query method:"+precomputed_query["query_method"]+" \nDoc method: "+doc_method +"\n")
            ##                self.logger.logReport(precomputed_query["query_text"]+"\n")

            # ACTUAL RETRIEVAL HAPPENING - run query
            retrieved = self.tfidfmodels[doc_method].runQuery(
                precomputed_query,
                addExtraWeights(all_doc_methods[doc_method]["runtime_parameters"], self.exp),
                guid,
                max_results=exp.get("max_results_recall", MAX_RESULTS_RECALL),
            )

            if not retrieved:  # the query was empty or something
                self.addEmptyResult(guid, precomputed_query, doc_method)
            else:
                self.addResult(guid, precomputed_query, doc_method, retrieved)

        if self.exp.get("add_random_control_result", False):
            self.addRandomControlResult(guid, precomputed_query)

        self.logger.showProgressReport(guid)  # prints out info on how it's going

    def processAllQueries(self):
        """
            MAIN LOOP over all precomputed queries
        """
        for precomputed_query in self.precomputed_queries:
            self.processOneQuery(precomputed_query)

    def runPipeline(self, exp, options):
        """
            Run the whole experiment pipeline, loading everything from
            precomputed json

            :param exp: experiment dict
        """
        self.exp = exp
        self.options = options

        self.max_per_class_results = self.exp.get("max_per_class_results", self.max_per_class_results)
        self.per_class_count = defaultdict(lambda: 0)
        if self.exp.get("similiarity_tie_breaker", 0):
            for model in self.tfidfmodels.items():
                model.tie_breaker = self.exp["similiarity_tie_breaker"]

        self.startLogging()
        self.initializePipeline()
        self.loadQueriesAndFileList()
        self.logger.setNumItems(len(self.precomputed_queries))
        self.populateMethods()

        self.previous_guid = ""

        # MAIN LOOP over all precomputed queries
        self.processAllQueries()

        self.saveResultsAndCleanUp()

Example #3

Show file

File: weight_training.py Project: danieldmm/minerva

    def measurePrecomputedResolution(self, retrieval_results, method, parameters, citation_az="*"):
        """
            This is kind of like measureCitationResolution:
            it takes a list of precomputed retrieval_results, then applies the new
            parameters to them. This is how we recompute what Lucene gives us,
            avoiding having to call Lucene again and so speeding it up a lot.

            All we need to do is adjust the weights on the already available
            explanation formulas.
        """
        logger = ResultsLogger(False, dump_straight_to_disk=False)  # init all the logging/counting
        logger.startCounting()  # for timing the process, start now

        logger.setNumItems(len(retrieval_results), print_out=False)

        # for each query-result: (results are packed inside each query for each method)
        for result in retrieval_results:
            # select only the method we're testing for
            if "formulas" not in result:
                # there was an error reading this result
                continue

            formulas = result["formulas"]
            retrieved = runPrecomputedQuery(formulas, parameters)

            result_dict = {
                "file_guid": result["file_guid"],
                "citation_id": result["citation_id"],
                "doc_position": result["doc_position"],
                "query_method": result["query_method"],
                "doc_method": method,
                "az": result["az"],
                "cfc": result["cfc"],
                "match_guid": result["match_guid"],
            }

            if not retrieved or len(retrieved) == 0:  # the query was empty or something
                ##                print "Error: ", doc_method , qmethod,tfidfmodels[method].indexDir
                ##                logger.addResolutionResult(guid,m,doc_position,qmethod,doc_method ,0,0,0)
                result_dict["mrr_score"] = 0
                result_dict["precision_score"] = 0
                result_dict["ndcg_score"] = 0
                result_dict["rank"] = 0
                result_dict["first_result"] = ""

                logger.addResolutionResultDict(result_dict)
            else:
                result = logger.measureScoreAndLog(retrieved, result["citation_multi"], result_dict)

        logger.computeAverageScores()
        results = []
        for query_method in logger.averages:
            for doc_method in logger.averages[query_method]:
                weights = parameters
                data_line = {"query_method": query_method, "doc_method": doc_method, "citation_az": citation_az}

                for metric in logger.averages[query_method][doc_method]:
                    data_line["avg_" + metric] = logger.averages[query_method][doc_method][metric]
                data_line["precision_total"] = logger.scores["precision"][query_method][doc_method]

                results.append(data_line)

        return results