Example #1
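Both examples below are methods extracted from a larger class, so the listing does not show the imports they rely on. A minimal set, assuming the standard library and NLTK (the project-specific helper index_from_list is sketched after Example #1):

from collections import defaultdict                      # relevance groups per query
from nltk.tokenize.punkt import PunktSentenceTokenizer   # sentence splitting in Example #2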
    def __build(self, query_docs_subset):

        if query_docs_subset is None:
            # nothing to build a subset from
            return

        self.sub_set_goldstandard = {}
        self.collection = {}
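        # (sub_set_goldstandard will map: query id -> relevance group -> list of doc ids,
        #  and collection will map: doc id -> document text)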

        # filter the goldstandard
        for _id, relevance in query_docs_subset.items():

            if _id not in self.goldstandard:
                self.skipped_queries.append(_id)
                continue

            # do not use queries without true positives
            # this adds overhead that could be avoided by refactoring the following for loop
            unique_relevants = set(
                sum([
                    self.goldstandard[_id][k]
                    for k in self.goldstandard[_id].keys() if k > 0
                ], []))
            if all(doc["id"] not in unique_relevants for doc in relevance):
                self.skipped_queries.append(_id)
                continue

            self.sub_set_goldstandard[_id] = defaultdict(list)

            for doc in relevance:
                k = self.__find_relevance_group(doc["id"],
                                                self.goldstandard[_id])
                if k > 0:
                    if self.use_relevance_groups:
                        self.sub_set_goldstandard[_id][k].append(doc["id"])
                    else:
                        self.sub_set_goldstandard[_id][1].append(doc["id"])
                else:
                    # by default, add to the least relevant group (0)
                    self.sub_set_goldstandard[_id][0].append(doc["id"])

                # add the document text to the collection
                self.collection[doc["id"]] = doc["text"]

        # remove the skipped queries from the data
        index_to_remove = []

        for skipped in self.skipped_queries:
            _index = index_from_list(self.query_list,
                                     lambda x: x["id"] == skipped)
            if _index > -1:
                index_to_remove.append(_index)
        index_to_remove.sort(key=lambda x: -x)

        # start removing from the tail
        for _index in index_to_remove:
            del self.query_list[_index]

        # stats
        if self.verbose:
            max_keys = max(
                map(lambda x: max(x.keys()),
                    self.sub_set_goldstandard.values()))

            for k in range(max_keys + 1):
                print(
                    "Minimum number of relevance type({}) in the queries of the goldstandard sub set: {}"
                    .format(
                        k,
                        min(
                            map(lambda x: len(x[k]),
                                self.sub_set_goldstandard.values()))))

                print(
                    "Mean number of relevance type({}) in the queries of the goldstandard sub set: {}"
                    .format(
                        k,
                        sum(
                            map(lambda x: len(x[k]),
                                self.sub_set_goldstandard.values())) /
                        len(self.sub_set_goldstandard)))

            print("Sub Collection size", len(self.collection))
            print("Number of skipped question, due to lack of true positives",
                  len(self.skipped_queries))
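Example #1 (and Example #2 below) also calls helpers that are not part of the listing: index_from_list and the private method __find_relevance_group. The following is only a sketch of their likely behavior, inferred from the call sites above; it is not the original implementation.

def index_from_list(items, predicate):
    # return the index of the first element that satisfies the predicate, -1 if none does
    for i, item in enumerate(items):
        if predicate(item):
            return i
    return -1


# in the original code this is a private method of the same class (self.__find_relevance_group)
def find_relevance_group(doc_id, goldstandard_entry):
    # goldstandard_entry maps a relevance group (int) to a list of document ids;
    # return the group that contains doc_id, or 0 when the document is not relevant
    for group, doc_ids in goldstandard_entry.items():
        if doc_id in doc_ids:
            return group
    return 0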
Example #2
    def __build(self, query_docs_subset):

        if query_docs_subset is None:
            return 
        
        self.sub_set_goldstandard = {}
        self.collection = {}
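        # (sub_set_goldstandard will map: query id -> {0,1} -> list of {"id", "score", "snippet_index"} dicts,
        #  and collection will map: doc id -> list of sentence dicts {"text", "start", "end"})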
        progress = 0
        # filter the goldstandard
        for _id, relevance in query_docs_subset.items():
            print("running query:", progress, end="\r")
            progress+=1
            if _id not in self.goldstandard:
                self.skipped_queries.append(_id)
                continue
            
            # do not use queries without true positives
            # this adds overhead that could be avoided by refactoring the following for loop
            unique_relevants = set(map(lambda x: x["id"], self.goldstandard[_id]))
            if all(doc["id"] not in unique_relevants for doc in relevance):
                self.skipped_queries.append(_id)
                continue
            
            self.sub_set_goldstandard[_id] = defaultdict(list)
            
            for doc in relevance: # for each document that was retrieved
                
                # Splitting and saving the document
                doc_to_sentence = []

                for _iter, position in enumerate(PunktSentenceTokenizer().span_tokenize(doc["text"])):
                    start, end = position
                    if _iter > 0:  # after the first sentence (the title), shift offsets so they point into the abstract
                        start = start - len(doc["title"]) - 1
                        end = end - len(doc["title"])
                        
                    _doc = {"text":doc["text"][start:end],
                            "start":start,
                            "end":end}
                    
                    doc_to_sentence.append(_doc)
                
                self.collection[doc["id"]] = doc_to_sentence
                
                # goldstandard should store the doc_id and the index of the positive snippets
                if doc["id"] in unique_relevants:
                    _doc_snippets = {
                        "id": doc["id"],
                        "score":doc["score"],
                        "snippet_index": self.find_relevant_snippets(doc_to_sentence, self.goldstandard[_id], len(doc["title"]), doc["id"])
                    }
                    self.sub_set_goldstandard[_id][1].append(_doc_snippets)
                else:
                    _doc_snippets = {
                        "id": doc["id"],
                        "score":doc["score"],
                        "snippet_index": [ 0 for _ in range(len(doc_to_sentence))] # empty relevance
                    }
                    self.sub_set_goldstandard[_id][0].append(_doc_snippets)
                
        # remove the skipped queries from the data
        index_to_remove = []
        
        for skipped in self.skipped_queries:
            _index = index_from_list(self.query_list, lambda x: x["id"] == skipped)
            if _index > -1:
                index_to_remove.append(_index)
        index_to_remove.sort(key=lambda x: -x)
        
        # start removing from the tail
        for _index in index_to_remove:
            del self.query_list[_index]
        
        # stats
        if self.verbose:
            max_keys = max(map(lambda x:max(x.keys()), self.sub_set_goldstandard.values()))
            
            for k in range(max_keys+1):
                print("Minimum number of relevance type({}) in the queries of the goldstandard sub set: {}".format(k, min(map(lambda x: len(x[k]), self.sub_set_goldstandard.values()))))
            
                print("Mean number of relevance type({}) in the queries of the goldstandard sub set: {}".format(k, sum(map(lambda x: len(x[k]), self.sub_set_goldstandard.values()))/len(self.sub_set_goldstandard)))
            
            print("Sub Collection size", len(self.collection))
            print("Number of skipped question, due to lack of true positives", len(self.skipped_queries))