def trainWeights(self):
    """
        Run the final stage of the weight training pipeline.
    """
    gc.collect()
    options = self.options
    self.all_doc_methods = getDictOfTestingMethods(self.exp["doc_methods"])

    best_weights = {}
    if options.get("override_folds", None):
        self.exp["cross_validation_folds"] = options["override_folds"]

    if options.get("override_metric", None):
        self.exp["metric"] = options["override_metric"]

    numfolds = self.exp.get("cross_validation_folds", 2)

    # First we find the best weights on each fold's training set
    for split_fold in range(numfolds):
        print("\nFold #" + str(split_fold))
        best_weights[split_fold] = self.dynamicWeightValues(split_fold)
        gc.collect()

    # Then we test the trained weights against each fold's test set
    print("Now applying and testing weights...\n")
    self.measureScoresOfWeights(best_weights)
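
# Illustrative only, a rough sketch of how this stage might be invoked. The class
# name and the dict values below are assumptions, not part of this module; only the
# keys ("doc_methods", "cross_validation_folds", "metric", "override_folds") are
# taken from the code above:
#
#     pipeline = WeightTrainer()                     # hypothetical class name
#     pipeline.exp = {"doc_methods": {...},          # nested method configuration
#                     "train_weights_for": [...],    # zone types to train for
#                     "cross_validation_folds": 4,
#                     "metric": "ndcg"}              # hypothetical metric name
#     pipeline.options = {"override_folds": 2}       # optional per-run overrides
#     pipeline.trainWeights()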
def trainKeywords(self):
    """
        Run the final stage of the keyword training pipeline.
    """
    gc.collect()
    options = self.options
    self.all_doc_methods = getDictOfTestingMethods(self.exp["doc_methods"])

    trained_extractors = {}
    if options.get("override_folds", None):
        self.exp["cross_validation_folds"] = options["override_folds"]

    if options.get("override_metric", None):
        self.exp["metric"] = options["override_metric"]

    numfolds = self.exp.get("cross_validation_folds", 2)

    # First we train a keyword extractor on each fold's training set
    for split_fold in range(numfolds):
        print("\nFold #" + str(split_fold))
        trained_extractors[split_fold] = self.trainExtractor(split_fold)
        gc.collect()

    # Then we test the trained extractors against each fold's test set
    print("Now applying and testing keywords...\n")
    self.measureScoresOfKeywords(trained_extractors)
def trainExtractor(self, split_fold):
    """
        Train a keyword extractor for the given fold.
    """
    all_doc_methods = getDictOfTestingMethods(self.exp["doc_methods"])
    annotated_boost_methods = [x for x in all_doc_methods if all_doc_methods[x]["type"] in ["annotated_boost"]]

    numfolds = self.exp.get("cross_validation_folds", 2)

    retrieval_results = self.loadPrecomputedFormulas()
    if len(retrieval_results) == 0:
        print("No precomputed formulas")
        return {}

    if len(retrieval_results) < numfolds:
        print("Number of results is smaller than the number of folds")
        return {}

    cv = cross_validation.KFold(len(retrieval_results), n_folds=numfolds, shuffle=False, random_state=None)  # indices=True, k=None
    cv = [k for k in cv]

    traincv, testcv = cv[split_fold]
    if isinstance(retrieval_results, ResultIncrementalReader):
        train_set = retrieval_results.subset(traincv)
    elif isinstance(retrieval_results, list):
        train_set = [retrieval_results[i] for i in traincv]
    else:
        raise ValueError("Unknown class of results")

    if len(train_set) == 0:
        print("Training set len is 0!")
        return defaultdict(lambda: 1)

    print("Training for %d/%d citations" % (len(train_set), len(retrieval_results)))

    trained_models = {}
    for method in all_doc_methods:
        # what to do with the runtime_parameters?
##        all_doc_methods[method]["runtime_parameters"]=weights
        trained_models[method] = TFIDFKeywordExtractor()
        trained_models[method].train(train_set)

    return trained_models
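
# Note: cross_validation.KFold(n, n_folds=k) is the pre-0.18 scikit-learn API;
# iterating it yields (train_indices, test_indices) pairs, which is what
# cv[split_fold] unpacks above. A rough equivalent with the current
# sklearn.model_selection API would be:
#
#     from sklearn.model_selection import KFold
#     folds = list(KFold(n_splits=numfolds, shuffle=False).split(list(range(len(retrieval_results)))))
#     traincv, testcv = folds[split_fold]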
def precomputeQueries(self, exp):
    """
        Precompute all queries for all annotated citation contexts.

        :param exp: experiment dict with all options
        :type exp: dict
    """
    self.exp = exp
    print("Precomputing queries...")
    logger = ProgressIndicator(True, numitems=len(exp["test_files"]))  # init all the logging/counting
    logger.numchunks = exp.get("numchunks", 10)

    cp.Corpus.loadAnnotators()

    # convert nested dict to flat dict where each method includes its parameters in the name
    self.all_doc_methods = getDictOfTestingMethods(exp["doc_methods"])

    self.precomputed_queries = []
    self.files_dict = OrderedDict()

##    if exp["full_corpus"]:
##        files_dict["ALL_FILES"]={}
##        files_dict["ALL_FILES"]["doc_methods"]=all_doc_methods
##        files_dict["ALL_FILES"]["tfidf_models"]=[]
##        for method in all_doc_methods:
##            actual_dir=cp.Corpus.getRetrievalIndexPath("ALL_FILES",all_doc_methods[method]["index_filename"],exp["full_corpus"])
##            files_dict["ALL_FILES"]["tfidf_models"].append({"method":method,"actual_dir":actual_dir})

    #===================================
    # MAIN LOOP over all testing files
    #===================================
    for guid in exp["test_files"]:
        try:
            self.processOneFile(guid)
        except ValueError:
            print("Can't load SciDoc ", guid)
            continue

        logger.showProgressReport(guid)  # prints out info on how it's going

    self.saveAllQueries()
    print("Precomputed queries saved.")
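
# For reference, getDictOfTestingMethods() flattens the nested exp["doc_methods"]
# configuration into a flat dict whose keys embed each method's parameters. The
# exact shape below is an illustrative assumption; only the "type",
# "index_filename" and "runtime_parameters" fields are actually relied on
# elsewhere in this module:
#
#     all_doc_methods = {
#         "annotated_boost_1": {                       # hypothetical flattened key
#             "type": "annotated_boost",
#             "index_filename": "some_index",          # hypothetical index name
#             "runtime_parameters": {"title": 1, "abstract": 1},
#         },
#     }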
def dynamicWeightValues(self, split_fold):
    """
        Find a good combination of weights using a greedy heuristic: rather than
        testing every possible combination, adjust one weight at a time and keep
        each change that improves the score.
    """
    all_doc_methods = getDictOfTestingMethods(self.exp["doc_methods"])
    annotated_boost_methods = [x for x in all_doc_methods if all_doc_methods[x]["type"] in ["annotated_boost"]]

    initialization_methods = [1]
##    initialization_methods=[1,"random"]
    MIN_WEIGHT = 0
##    self.exp["movements"]=[-1,3]
    self.exp["movements"] = [-1, 6, -2]

    best_weights = {}

    numfolds = self.exp.get("cross_validation_folds", 2)
##    counter=weightCounterList(exp["weight_values"])

    print("Processing zones ", self.exp["train_weights_for"])

    for query_type in self.exp["train_weights_for"]:
        best_weights[query_type] = {}
        results_compare = []

        retrieval_results = self.loadPrecomputedFormulas(query_type)
        if len(retrieval_results) == 0:
            print("No precomputed formulas for ", query_type)
            continue

        if len(retrieval_results) < numfolds:
            print("Number of results is smaller than number of folds for zone type ", query_type)
            continue

        cv = cross_validation.KFold(len(retrieval_results), n_folds=numfolds, shuffle=False, random_state=None)  # indices=True, k=None
        cv = [k for k in cv]

        traincv, testcv = cv[split_fold]
        if isinstance(retrieval_results, ResultIncrementalReader):
            train_set = retrieval_results.subset(traincv)
        elif isinstance(retrieval_results, list):
            train_set = [retrieval_results[i] for i in traincv]
        else:
            raise ValueError("Unknown class of results")

        if len(train_set) == 0:
            print("Training set len is 0!")
            return defaultdict(lambda: 1)

        print("Training for citations in ", query_type, "zones:", len(train_set), "/", len(retrieval_results))

        for method in annotated_boost_methods:
            res = {}

            for weight_initialization in initialization_methods:
                if weight_initialization == 1:
##                    counter.initWeights(all_doc_methods[method]["runtime_parameters"])
                    weights = {x: 1 for x in all_doc_methods[method]["runtime_parameters"]}
                elif weight_initialization == "random":
                    weights = {x: random.randint(-10, 10) for x in all_doc_methods[method]["runtime_parameters"]}
##                    counter.weights={x:random.randint(-10,10) for x in all_doc_methods[method]["runtime_parameters"]}

                all_doc_methods[method]["runtime_parameters"] = weights

                print("Computing initial score...")
                scores = self.measurePrecomputedResolution(train_set, method, addExtraWeights(weights, self.exp), query_type)

                score_baseline = scores[0][self.exp["metric"]]
                previous_score = score_baseline
                first_baseline = score_baseline
                score_progression = [score_baseline]

                global GLOBAL_FILE_COUNTER
##                drawWeights(self.exp,weights,query_type+"_weights_"+str(GLOBAL_FILE_COUNTER))
##                drawScoreProgression(self.exp,score_progression,query_type+"_"+str(GLOBAL_FILE_COUNTER))
                GLOBAL_FILE_COUNTER += 1

                overall_improvement = score_baseline
                passes = 0

                print("Finding best weights...")
                while passes < 3 or overall_improvement > 0:
                    for direction in self.exp["movements"]:  # [-1, 6, -2]
                        print("Direction: ", direction)
                        for index in range(len(weights)):
##                            print("Weight: ", index)
                            weight_name = list(weights.keys())[index]
                            prev_weight = weights[weight_name]
                            # hard lower limit of 0 for weights
                            weights[weight_name] = max(MIN_WEIGHT, weights[weight_name] + direction)

                            scores = self.measurePrecomputedResolution(train_set, method, addExtraWeights(weights, self.exp), query_type)
                            this_score = scores[0][self.exp["metric"]]

                            if this_score <= previous_score:
                                # no improvement: undo the change
                                weights[weight_name] = prev_weight
                            else:
                                # improvement: keep the change and update the baseline
                                previous_score = this_score
                                overall_improvement = this_score - score_baseline
                                score_baseline = this_score
                                score_progression.append(this_score)

                    # This is to export the graphs as weights are trained
##                    drawWeights(self.exp,weights,query_type+"_weights_"+str(GLOBAL_FILE_COUNTER))
##                    drawScoreProgression(self.exp,{self.exp["metric"]:score_progression},query_type+"_"+str(GLOBAL_FILE_COUNTER))
                    GLOBAL_FILE_COUNTER += 1

                    passes += 1

                scores = self.measurePrecomputedResolution(train_set, method, addExtraWeights(weights, self.exp), query_type)
                this_score = scores[0][self.exp["metric"]]

##                if split_fold is not None:
##                    split_set_str="_s"+str(split_fold)
##                else:
##                    split_set_str=""

##                print "Weight inialization:",weight_initalization
                improvement = 100 * ((this_score - first_baseline) / float(first_baseline)) if first_baseline > 0 else 0
                print("  Weights found, with score: {:.5f}".format(this_score),
                      " Improvement: {:.2f}%".format(improvement))
                best_weights[query_type][method] = addExtraWeights(weights, self.exp)
                print("  ", weights.values())

                if self.exp.get("smooth_weights", None):
                    # this is to smooth a bit the weights in case they're too crazy
                    for weight in best_weights[query_type][method]:
                        amount = abs(min(1, best_weights[query_type][method][weight]) / float(3))
                        if best_weights[query_type][method][weight] > 1:
                            best_weights[query_type][method][weight] -= amount
                        elif best_weights[query_type][method][weight] < 1:
                            best_weights[query_type][method][weight] += amount

                res[weight_initialization] = this_score

            results_compare.append(res)

##        better=0
##        diff=0
##        for res in results_compare:
##            if res["random"] > res[1]:
##                better+=1
##            diff+=res[1]-res["random"]
##        print "Random inialization better than dynamic setting",better,"times"
##        print "Avg difference between methods:",diff/float(len(results_compare))

        for init_method in initialization_methods:
            if len(results_compare) > 0:
                avg = sum([res[init_method] for res in results_compare]) / float(len(results_compare))
            else:
                avg = 0
            print("Avg for ", init_method, ":", avg)

##            if split_set is not None:
##                split_set_str="_s"+str(split_set)
##            else:
##                split_set_str=""
##            filename=getSafeFilename(self.exp["exp_dir"]+"weights_"+query_type+"_"+str(counter.getPossibleValues())+split_set_str+filename_add+".csv")
##            data.to_csv(filename)

    return best_weights
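
# The search above is a greedy coordinate ascent: starting from the initial weights,
# it repeatedly nudges one weight at a time by each step in exp["movements"], keeps a
# change only when the metric improves, and runs for at least three passes and until
# no further improvement is found. A simplified, self-contained sketch of the same
# idea, with a toy score_fn standing in for measurePrecomputedResolution():
#
#     def greedy_weight_search(weights, score_fn, movements=(-1, 6, -2), min_weight=0):
#         best = score_fn(weights)
#         improved = True
#         while improved:
#             improved = False
#             for step in movements:
#                 for name in list(weights.keys()):
#                     old = weights[name]
#                     weights[name] = max(min_weight, old + step)  # hard lower limit, as above
#                     new_score = score_fn(weights)
#                     if new_score > best:
#                         best, improved = new_score, True         # keep the change
#                     else:
#                         weights[name] = old                      # revert the change
#         return weights, best
#
#     # e.g. greedy_weight_search({"title": 1, "abstract": 1}, lambda w: -abs(w["title"] - 7))
#     # returns ({"title": 7, "abstract": 1}, 0)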