Example #1
    def trainWeights(self):
        """
            Run the final stage of the weight training pipeline.
        """
        gc.collect()
        options = self.options
        self.all_doc_methods = getDictOfTestingMethods(self.exp["doc_methods"])

        best_weights = {}
        if options.get("override_folds", None):
            self.exp["cross_validation_folds"] = options["override_folds"]

        if options.get("override_metric", None):
            self.exp["metric"] = options["override_metric"]

        numfolds = self.exp.get("cross_validation_folds", 2)

        # First we find the highest weights for each fold's training set
        for split_fold in range(numfolds):
            print("\nFold #" + str(split_fold))
            best_weights[split_fold] = self.dynamicWeightValues(split_fold)
            gc.collect()

        # Then we actually test them against each fold's test set
        print("Now applying and testing weights...\n")
        self.measureScoresOfWeights(best_weights)
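
The fold loop above follows a common train-then-evaluate pattern: the best parameters are found separately on each fold's training split, and only afterwards are all of them applied and scored on held-out data. Below is a minimal standalone sketch of that pattern; run_folds, train_on_fold and evaluate are hypothetical stand-ins, not part of the pipeline above.

def run_folds(numfolds, train_on_fold, evaluate):
    """Train per fold first, then score everything, mirroring trainWeights above."""
    best_params = {}
    for split_fold in range(numfolds):
        # each fold finds its own best parameters on its training split
        best_params[split_fold] = train_on_fold(split_fold)
    # only afterwards are the per-fold parameters applied and tested
    return evaluate(best_params)

# toy usage with stand-in callables
print(run_folds(2, lambda fold: {"weight": fold + 1}, lambda params: params))
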
Example #2
    def trainKeywords(self):
        """
            Run the final stage of the keyword training pipeline.
        """
        gc.collect()
        options = self.options
        self.all_doc_methods = getDictOfTestingMethods(self.exp["doc_methods"])

        best_keywords = {}
        if options.get("override_folds", None):
            self.exp["cross_validation_folds"] = options["override_folds"]

        if options.get("override_metric", None):
            self.exp["metric"] = options["override_metric"]

        numfolds = self.exp.get("cross_validation_folds", 2)

        # First we train a keyword extractor from each fold's training set
        for split_fold in range(numfolds):
            print("\nFold #"+str(split_fold))
            trained_extractors[split_fold]=self.trainExtractor(split_fold)
            gc.collect()

        # Then we actually test them against each fold's test set
        print("Now applying and testing keywords...\n")
        self.measureScoresOfKeywords(best_keywords)
Example #3
    def trainExtractor(self, split_fold):
        """
            Train an extractor for the given fold
        """
        all_doc_methods = getDictOfTestingMethods(self.exp["doc_methods"])
        annotated_boost_methods = [x for x in all_doc_methods if all_doc_methods[x]["type"] in ["annotated_boost"]]

        numfolds = self.exp.get("cross_validation_folds", 2)

        retrieval_results = self.loadPrecomputedFormulas()
        if len(retrieval_results) == 0:
            print("No precomputed formulas")
            return {}

        if len(retrieval_results) < numfolds:
            print("Number of results is smaller than the number of folds")
            return {}

        cv = cross_validation.KFold(len(retrieval_results), n_folds=numfolds, shuffle=False, random_state=None) # indices=True, k=None
        cv = [k for k in cv]

        traincv, testcv = cv[split_fold]
        if isinstance(retrieval_results, ResultIncrementalReader):
            train_set = retrieval_results.subset(traincv)
        elif isinstance(retrieval_results, list):
            train_set = [retrieval_results[i] for i in traincv]
        else:
            raise ValueError("Unknown class of results")
##            train_set=retrieval_results.subset(traincv)
##            train_set=[retrieval_results[i] for i in traincv]
        if len(train_set) == 0:
            print("Training set len is 0!")
            return defaultdict(lambda:1)

        print("Training for %d/%d citations " % (len(train_set),len(retrieval_results)))
        trained_models={}
        for method in all_doc_methods:
            res={}
            # what to do with the runtime_parameters?
##            all_doc_methods[method]["runtime_parameters"]=weights
            trained_models[method]=TFIDFKeywordExtractor()
            trained_models[method].train(train_set)

        return trained_models
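
cross_validation.KFold(len(...), n_folds=...) is the pre-0.18 scikit-learn API; materialising the generator with [k for k in cv] gives a list of (train_indices, test_indices) pairs, so cv[split_fold] is the index pair for the current fold. A rough equivalent using the current sklearn.model_selection API, with stand-in data in place of the loaded retrieval results:

from sklearn.model_selection import KFold  # modern replacement for cross_validation.KFold

retrieval_results = list(range(10))  # stand-in for the precomputed formula results
numfolds = 2
cv = list(KFold(n_splits=numfolds, shuffle=False).split(retrieval_results))

traincv, testcv = cv[0]  # numpy index arrays for fold 0
train_set = [retrieval_results[i] for i in traincv]
print(train_set, list(testcv))
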
Example #4
    def precomputeQueries(self, exp):
        """
            Precompute all queries for all annotated citation contexts

            :param exp: experiment dict with all options
            :type exp: dict
        """
        self.exp = exp
        print("Precomputing queries...")
        logger = ProgressIndicator(True, numitems=len(exp["test_files"]))  # init all the logging/counting
        logger.numchunks = exp.get("numchunks", 10)

        cp.Corpus.loadAnnotators()

        # convert nested dict to flat dict where each method includes its parameters in the name
        self.all_doc_methods = getDictOfTestingMethods(exp["doc_methods"])

        self.precomputed_queries = []
        self.files_dict = OrderedDict()

##        if exp["full_corpus"]:
##            files_dict["ALL_FILES"]={}
##            files_dict["ALL_FILES"]["doc_methods"]=all_doc_methods
##            files_dict["ALL_FILES"]["tfidf_models"]=[]
##            for method in all_doc_methods:
##                actual_dir=cp.Corpus.getRetrievalIndexPath("ALL_FILES",all_doc_methods[method]["index_filename"],exp["full_corpus"])
##                files_dict["ALL_FILES"]["tfidf_models"].append({"method":method,"actual_dir":actual_dir})

        #===================================
        # MAIN LOOP over all testing files
        #===================================
        for guid in exp["test_files"]:
            try:
                self.processOneFile(guid)
            except ValueError:
                print("Can't load SciDoc ",guid)
                continue

            logger.showProgressReport(guid) # prints out info on how it's going

        self.saveAllQueries()
        print("Precomputed queries saved.")
Example #5
    def dynamicWeightValues(self, split_fold):
        """
            Find the best combination of weights using a greedy heuristic, not
            testing every possible one, but selecting the best one at each stage
        """
        all_doc_methods = getDictOfTestingMethods(self.exp["doc_methods"])
        annotated_boost_methods = [x for x in all_doc_methods if all_doc_methods[x]["type"] in ["annotated_boost"]]

        initialization_methods = [1]
        ##    initialization_methods=[1,"random"]
        MIN_WEIGHT = 0
        ##    self.exp["movements"]=[-1,3]
        self.exp["movements"] = [-1, 6, -2]

        best_weights = {}

        numfolds = self.exp.get("cross_validation_folds", 2)
        ##    counter=weightCounterList(exp["weight_values"])

        print("Processing zones ", self.exp["train_weights_for"])

        for query_type in self.exp["train_weights_for"]:
            best_weights[query_type] = {}
            results_compare = []

            retrieval_results = self.loadPrecomputedFormulas(query_type)
            if len(retrieval_results) == 0:
                print("No precomputed formulas for ", query_type)
                continue

            if len(retrieval_results) < numfolds:
                print("Number of results is smaller than number of folds for zone type ", query_type)
                continue

            cv = cross_validation.KFold(
                len(retrieval_results), n_folds=numfolds, shuffle=False, random_state=None
            )  # indices=True, k=None
            cv = [k for k in cv]

            traincv, testcv = cv[split_fold]
            if isinstance(retrieval_results, ResultIncrementalReader):
                train_set = retrieval_results.subset(traincv)
            elif isinstance(retrieval_results, list):
                train_set = [retrieval_results[i] for i in traincv]
            else:
                raise ValueError("Unknown class of results")
            ##            train_set=retrieval_results.subset(traincv)
            ##            train_set=[retrieval_results[i] for i in traincv]
            if len(train_set) == 0:
                print("Training set len is 0!")
                return defaultdict(lambda: 1)

            print("Training for citations in ", query_type, "zones:", len(train_set), "/", len(retrieval_results))
            for method in annotated_boost_methods:
                res = {}

                for weight_initalization in initialization_methods:
                    if weight_initalization == 1:
                        ##                    counter.initWeights(all_doc_methods[method]["runtime_parameters"])
                        weights = {x: 1 for x in all_doc_methods[method]["runtime_parameters"]}
                    elif weight_initalization == "random":
                        weights = {x: random.randint(-10, 10) for x in all_doc_methods[method]["runtime_parameters"]}
                    ##                    counter.weights={x:random.randint(-10,10) for x in all_doc_methods[method]["runtime_parameters"]}

                    all_doc_methods[method]["runtime_parameters"] = weights
                    print("Computing initial score...")
                    scores = self.measurePrecomputedResolution(
                        train_set, method, addExtraWeights(weights, self.exp), query_type
                    )

                    score_baseline = scores[0][self.exp["metric"]]
                    previous_score = score_baseline
                    first_baseline = score_baseline
                    score_progression = [score_baseline]

                    global GLOBAL_FILE_COUNTER
                    ##                    drawWeights(self.exp,weights,query_type+"_weights_"+str(GLOBAL_FILE_COUNTER))
                    ##                    drawScoreProgression(self.exp,score_progression,query_type+"_"+str(GLOBAL_FILE_COUNTER))
                    GLOBAL_FILE_COUNTER += 1

                    overall_improvement = score_baseline
                    passes = 0

                    print("Finding best weights...")
                    while passes < 3 or overall_improvement > 0:
                        for direction in self.exp["movements"]:  # [-1,6,-2]
                            print("Direction: ", direction)
                            for index in range(len(weights)):
                                ##                                print("Weight: ", index)
                                weight_name = list(weights.keys())[index]
                                prev_weight = weights[weight_name]
                                # hard lower limit of 0 for weights
                                weights[weight_name] = max(MIN_WEIGHT, weights[weight_name] + direction)

                                scores = self.measurePrecomputedResolution(
                                    train_set, method, addExtraWeights(weights, self.exp), query_type
                                )
                                this_score = scores[0][self.exp["metric"]]

                                if this_score <= previous_score:
                                    weights[weight_name] = prev_weight
                                else:
                                    previous_score = this_score

                        overall_improvement = this_score - score_baseline
                        score_baseline = this_score
                        score_progression.append(this_score)

                        # This is to export the graphs as weights are trained
                        ##                        drawWeights(self.exp,weights,query_type+"_weights_"+str(GLOBAL_FILE_COUNTER))
                        ##                        drawScoreProgression(self.exp,{self.exp["metric"]:score_progression},query_type+"_"+str(GLOBAL_FILE_COUNTER))
                        GLOBAL_FILE_COUNTER += 1

                        passes += 1

                    scores = self.measurePrecomputedResolution(
                        train_set, method, addExtraWeights(weights, self.exp), query_type
                    )
                    this_score = scores[0][self.exp["metric"]]

                    ##                if split_fold is not None:
                    ##                    split_set_str="_s"+str(split_fold)
                    ##                else:
                    ##                    split_set_str=""

                    ##                print "Weight inialization:",weight_initalization
                    improvement = (
                        100 * ((this_score - first_baseline) / float(first_baseline)) if first_baseline > 0 else 0
                    )
                    print(
                        "   Weights found, with score: {:.5f}".format(this_score),
                        " Improvement: {:.2f}%".format(improvement),
                    )
                    best_weights[query_type][method] = addExtraWeights(weights, self.exp)
                    print("   ", weights.values())

                    if self.exp.get("smooth_weights", None):
                        # this is to smooth a bit the weights in case they're too crazy
                        for weight in best_weights[query_type][method]:
                            amount = abs(min(1, best_weights[query_type][method][weight]) / float(3))
                            if best_weights[query_type][method][weight] > 1:
                                best_weights[query_type][method][weight] -= amount
                            elif best_weights[query_type][method][weight] < 1:
                                best_weights[query_type][method][weight] += amount

                    res[weight_initalization] = this_score

                results_compare.append(res)

        ##        better=0
        ##        diff=0
        ##    for res in results_compare:
        ##        if res["random"] > res[1]:
        ##            better+=1
        ##        diff+=res[1]-res["random"]

        ##    print "Random inialization better than dynamic setting",better,"times"
        ##    print "Avg difference between methods:",diff/float(len(results_compare))
        for init_method in initialization_methods:
            if len(results_compare) > 0:
                avg = sum([res[init_method] for res in results_compare]) / float(len(results_compare))
            else:
                avg = 0
            print("Avg for ", init_method, ":", avg)
        ##        if split_set is not None:
        ##            split_set_str="_s"+str(split_set)
        ##        else:
        ##            split_set_str=""
        ##        filename=getSafeFilename(self.exp["exp_dir"]+"weights_"+query_type+"_"+str(counter.getPossibleValues())+split_set_str+filename_add+".csv")
        ##        data.to_csv(filename)

        return best_weights
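
The core of dynamicWeightValues is a greedy coordinate search: every weight is nudged in each configured direction ([-1, 6, -2]) and the move is kept only when the chosen metric improves, with the outer while loop running for at least three passes and stopping once a whole pass brings no further improvement. Below is a compact standalone sketch of that search, slightly tidied and using a stand-in score function instead of measurePrecomputedResolution:

MIN_WEIGHT = 0  # hard lower limit for every weight, as in the method above

def greedy_weight_search(weights, score_fn, movements=(-1, 6, -2)):
    """Greedy coordinate search mirroring the inner loop of dynamicWeightValues."""
    previous_score = baseline = score_fn(weights)
    passes, overall_improvement = 0, 0
    while passes < 3 or overall_improvement > 0:
        for direction in movements:
            for name in list(weights.keys()):
                prev_weight = weights[name]
                weights[name] = max(MIN_WEIGHT, weights[name] + direction)
                new_score = score_fn(weights)
                if new_score <= previous_score:
                    weights[name] = prev_weight  # revert moves that do not help
                else:
                    previous_score = new_score
        overall_improvement = previous_score - baseline
        baseline = previous_score
        passes += 1
    return weights, previous_score

# toy usage: the score peaks at 0 when both weights reach 5
target = {"title": 5, "abstract": 5}
score = lambda w: -sum(abs(w[k] - target[k]) for k in w)
print(greedy_weight_search({"title": 1, "abstract": 1}, score))
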