def __build(self, query_docs_subset):
    """Build the sub-set goldstandard and document collection for a query subset.

    For each query in ``query_docs_subset`` (mapping query id -> list of
    retrieved docs, each doc a dict with at least ``"id"`` and ``"text"``),
    the retrieved docs are bucketed by relevance group into
    ``self.sub_set_goldstandard[_id]`` and their texts stored in
    ``self.collection``. Queries absent from ``self.goldstandard`` or with no
    true positives among the retrieved docs are recorded in
    ``self.skipped_queries`` and removed from ``self.query_list``.

    NOTE(review): ``self.goldstandard[_id]`` is assumed to map a relevance
    group (int, >0 meaning relevant) to a list of doc ids — confirm against
    the loader that fills it.

    Args:
        query_docs_subset: dict of query id -> retrieved docs, or None to
            skip building entirely.
    """
    if query_docs_subset is None:
        return

    # Reset per-build state so repeated calls do not accumulate stale
    # entries (the sibling implementation of __build does the same).
    self.sub_set_goldstandard = {}
    self.collection = {}

    # Filter the goldstandard against the retrieved subset.
    for _id, relevance in query_docs_subset.items():
        if _id not in self.goldstandard:
            self.skipped_queries.append(_id)
            continue

        # All doc ids that belong to a positive relevance group (k > 0).
        # Set comprehension replaces the former quadratic sum([...], []).
        unique_relevants = {
            doc_id
            for k, doc_ids in self.goldstandard[_id].items()
            if k > 0
            for doc_id in doc_ids
        }

        # Do not use queries without true positives among the retrieved docs.
        if all(doc["id"] not in unique_relevants for doc in relevance):
            self.skipped_queries.append(_id)
            continue

        self.sub_set_goldstandard[_id] = defaultdict(list)

        for doc in relevance:
            k = self.__find_relevance_group(doc["id"], self.goldstandard[_id])
            if k > 0:
                # Keep fine-grained groups only when requested; otherwise
                # collapse every positive doc into group 1.
                group = k if self.use_relevance_groups else 1
                self.sub_set_goldstandard[_id][group].append(doc["id"])
            else:
                # Default: add to the least-relevant group.
                self.sub_set_goldstandard[_id][0].append(doc["id"])

            # Add the raw text to the collection.
            self.collection[doc["id"]] = doc["text"]

    # Remove the skipped queries from the query list, deleting from the
    # tail so earlier indices stay valid.
    index_to_remove = []
    for skipped in self.skipped_queries:
        _index = index_from_list(self.query_list, lambda x: x["id"] == skipped)
        if _index > -1:
            index_to_remove.append(_index)
    index_to_remove.sort(reverse=True)
    for _index in index_to_remove:
        del self.query_list[_index]

    # Stats. Guard against an empty sub set (every query skipped), which
    # would make max() raise ValueError.
    if self.verbose:
        if self.sub_set_goldstandard:
            max_keys = max(
                map(lambda x: max(x.keys()), self.sub_set_goldstandard.values()))
            for k in range(max_keys + 1):
                # .get avoids inserting empty lists into the defaultdicts
                # while computing stats; printed values are unchanged.
                print(
                    "Minimum number of relevance type({}) in the queries of the goldstandard sub set: {}"
                    .format(
                        k,
                        min(
                            map(lambda x: len(x.get(k, ())),
                                self.sub_set_goldstandard.values()))))
                print(
                    "Mean number of relevance type({}) in the queries of the goldstandard sub set: {}"
                    .format(
                        k,
                        sum(
                            map(lambda x: len(x.get(k, ())),
                                self.sub_set_goldstandard.values())) /
                        len(self.sub_set_goldstandard)))
        print("Sub Collection size", len(self.collection))
        print("Number of skipped question, due to lack of true positives",
              len(self.skipped_queries))
def __build(self, query_docs_subset):
    """Build a sentence-level sub-set goldstandard and collection.

    Each retrieved document's text is split into sentence spans; documents
    that appear in the goldstandard for the query are stored under group 1
    together with a per-sentence relevance index (via
    ``self.find_relevant_snippets``), all others under group 0 with an
    all-zero snippet index. Queries missing from the goldstandard or with no
    retrieved true positive are skipped and pruned from ``self.query_list``.

    NOTE(review): the span adjustment below subtracts ``len(title) + 1``
    from ``start`` but only ``len(title)`` from ``end``, and then slices the
    original text with the adjusted offsets — presumably the offsets are
    meant to be abstract-relative; confirm against find_relevant_snippets.
    """
    if query_docs_subset is None:
        return

    self.sub_set_goldstandard = {}
    self.collection = {}
    n_done = 0

    # filter the goldstandard
    for _id, relevance in query_docs_subset.items():
        print("running query:", n_done, end="\r")
        n_done += 1

        if _id not in self.goldstandard:
            self.skipped_queries.append(_id)
            continue

        # Ids of the true-positive documents for this query.
        unique_relevants = {entry["id"] for entry in self.goldstandard[_id]}

        # Skip queries whose retrieved list contains no true positive.
        if not any(doc["id"] in unique_relevants for doc in relevance):
            self.skipped_queries.append(_id)
            continue

        self.sub_set_goldstandard[_id] = defaultdict(list)

        # For each document that was retrieved: split into sentences,
        # store the spans, and record snippet-level relevance.
        for doc in relevance:
            sentences = []
            spans = PunktSentenceTokenizer().span_tokenize(doc["text"])
            for sent_idx, (start, end) in enumerate(spans):
                if sent_idx > 0:
                    # Shift the abstract sentences' offsets by the title
                    # length (see NOTE in the docstring).
                    start = start - len(doc["title"]) - 1
                    end = end - len(doc["title"])
                sentences.append({
                    "text": doc["text"][start:end],
                    "start": start,
                    "end": end,
                })

            self.collection[doc["id"]] = sentences

            # The goldstandard stores the doc id plus the index of the
            # positive snippets.
            if doc["id"] in unique_relevants:
                record = {
                    "id": doc["id"],
                    "score": doc["score"],
                    "snippet_index": self.find_relevant_snippets(
                        sentences, self.goldstandard[_id],
                        len(doc["title"]), doc["id"]),
                }
                self.sub_set_goldstandard[_id][1].append(record)
            else:
                record = {
                    "id": doc["id"],
                    "score": doc["score"],
                    # empty relevance
                    "snippet_index": [0] * len(sentences),
                }
                self.sub_set_goldstandard[_id][0].append(record)

    # Remove the skipped queries from the data, deleting from the tail so
    # earlier indices remain valid.
    to_remove = []
    for skipped in self.skipped_queries:
        pos = index_from_list(self.query_list, lambda x: x["id"] == skipped)
        if pos > -1:
            to_remove.append(pos)
    for pos in sorted(to_remove, reverse=True):
        del self.query_list[pos]

    # stats
    if self.verbose:
        max_keys = max(max(groups.keys())
                       for groups in self.sub_set_goldstandard.values())
        for k in range(max_keys + 1):
            per_query = [len(groups[k])
                         for groups in self.sub_set_goldstandard.values()]
            print(
                "Minimum number of relevance type({}) in the queries of the goldstandard sub set: {}"
                .format(k, min(per_query)))
            print(
                "Mean number of relevance type({}) in the queries of the goldstandard sub set: {}"
                .format(k, sum(per_query) / len(self.sub_set_goldstandard)))
        print("Sub Collection size", len(self.collection))
        print("Number of skipped question, due to lack of true positives",
              len(self.skipped_queries))