Beispiel #1
0
class qrels:
    """
    A class to create the qrels of the questions for SQUAD dataset.

    For every paragraph of the input file the matching indexed passage is
    looked up through ``Searcher.pairSearch`` and one line is written for each
    question of that paragraph whose ``is_impossible`` flag is ``False``.
    """
    def __init__(self, index_dir):
        """
        :param index_dir: the folder where the dataset index is stored.
        """
        self.searchObject = Searcher(index_dir)

    def get_id_section(self, pair):
        """
        This is the function that returns the id of the passage that is similar to the context.

        The hits are scanned in the order ``pairSearch`` returns them and the
        scan stops at the first hit whose ``id_section`` is not the empty
        string; if every hit has an empty id, the values of the last hit are
        returned.

        :param pair: the (title, context) pair handed to ``pairSearch``.
        :return: a tuple ``(id_section, content_section)``, or ``(None, None)``
            when the search returns no hit at all.
        """
        hits = self.searchObject.pairSearch(pair, BM25Similarity())
        # Both values start out bound so an empty result set yields
        # (None, None) instead of raising UnboundLocalError.
        section_id = None
        content = None
        for hit in hits:
            hit_doc = self.searchObject.searcher.doc(hit.doc)
            section_id = hit_doc.get("id_section")
            content = hit_doc.get("content_section")
            if section_id != "":
                break
        return section_id, content

    def process(self, input_file, output_dir):
        """
        This is the main function that creates the qrels file.

        Lines of the form ``<question id> 0 <passage id> 1 `` are appended to
        ``<output_dir>/qrels.txt``. Paragraphs for which no passage id could
        be resolved (no hit, or only hits with an empty/missing id) are
        skipped, because no valid line can be written for them.

        :param input_file: the file to process.
        :param output_dir: the folder where to store the qrels file.
        :return:
        """
        # Parse the dataset once; it serves both the progress-bar total and
        # the actual processing.
        with open(input_file, encoding="utf-8") as json_file:
            articles = json.load(json_file)['data']

        num_paragraphs = sum(len(article['paragraphs']) for article in articles)

        # The context managers guarantee the output file is flushed and closed
        # even if a search fails half-way through.
        with open(output_dir + "/qrels.txt", 'a+', encoding="utf-8") as output_file, \
                tqdm(total=num_paragraphs) as pbar:
            for article in articles:
                for par in article['paragraphs']:
                    pbar.update(1)
                    psg_id, _content = self.get_id_section(
                        (article['title'], par["context"]))
                    if not psg_id:
                        # Nothing usable came back from the index; writing a
                        # line without a passage id would corrupt the file.
                        continue
                    for q in par["qas"]:
                        if q["is_impossible"] is False:
                            output_file.write(q["id"] + " 0 " + psg_id +
                                              " 1 \n")
        print("==> Qrels successfully created.\n")
class ResultsGenerator:
    """
    A class to create the results files of the questions for each search setup.

    Every question whose ``is_impossible`` flag is ``False`` is searched with
    four ``Searcher`` methods (``simpleSearch``, ``pairSearch``,
    ``multiFieldsSearch``, ``multiFieldsPairSearch``), once with
    ``BM25Similarity`` and once with ``ClassicSimilarity``; each of the eight
    combinations is appended to its own results file.
    """
    def __init__(self, index_dir):
        """
        :param index_dir: the folder where the dataset index is stored.
        """
        self.searcher = Searcher(index_dir)

    def get_id_section(self, request):
        """
        Map search hits to the ``id_section`` field of the documents they reference.

        :param request: the hits returned by one of the ``Searcher`` search
            methods; each hit must expose a ``doc`` attribute.
        :return: the list of ``id_section`` values, in the same order as the hits.
        """
        fetch_doc = self.searcher.searcher.doc
        return [fetch_doc(hit.doc).get("id_section") for hit in request]

    def process(self, input_file, index_dir, output_dir):
        """
        This is the main function that creates the eight results files.

        Lines of the form
        ``<question id> Q0 <id_section> <rank> <score> STANDARD`` are appended
        to ``results_BM25_1..4.txt`` and ``results_VSM_1..4.txt`` inside
        ``output_dir`` (ranks start at 1, in the order the hits are returned).

        :param input_file: the file to process.
        :param index_dir: unused here (the index is chosen in ``__init__``);
            kept so existing callers keep working.
        :param output_dir: the folder where to store the results files.
        :return:
        """
        from contextlib import ExitStack

        search = self.searcher
        # One row per output file:
        # (file name, search method, query includes the title?, similarity class).
        # The similarity is instantiated anew for every search, as before.
        runs = (
            ("results_BM25_1.txt", search.simpleSearch, False, BM25Similarity),
            ("results_BM25_2.txt", search.pairSearch, True, BM25Similarity),
            ("results_BM25_3.txt", search.multiFieldsSearch, False, BM25Similarity),
            ("results_BM25_4.txt", search.multiFieldsPairSearch, True, BM25Similarity),
            ("results_VSM_1.txt", search.simpleSearch, False, ClassicSimilarity),
            ("results_VSM_2.txt", search.pairSearch, True, ClassicSimilarity),
            ("results_VSM_3.txt", search.multiFieldsSearch, False, ClassicSimilarity),
            ("results_VSM_4.txt", search.multiFieldsPairSearch, True, ClassicSimilarity),
        )

        # Parse the dataset once; it serves both the progress-bar total and
        # the actual processing.
        with open(input_file, encoding="utf-8") as json_file:
            articles = json.load(json_file)['data']

        num_questions = sum(
            len(par["qas"])
            for article in articles
            for par in article['paragraphs'])

        # ExitStack closes every output file (and the progress bar) on exit,
        # including when a search raises half-way through.
        with ExitStack() as stack:
            output_files = [
                stack.enter_context(
                    open(output_dir + "/" + file_name, 'a+', encoding="utf-8"))
                for file_name, _, _, _ in runs
            ]
            pbar = stack.enter_context(tqdm(total=num_questions))

            for article in articles:
                title = article["title"]
                for par in article['paragraphs']:
                    for q in par["qas"]:
                        pbar.update(1)
                        if q["is_impossible"] is not False:
                            continue
                        question_id = q["id"]
                        for output_file, run in zip(output_files, runs):
                            _, run_search, with_title, similarity_cls = run
                            if with_title:
                                query = [title, q["question"]]
                            else:
                                query = q["question"]
                            hits = run_search(query, similarity_cls())
                            section_ids = self.get_id_section(hits)
                            output_file.writelines(
                                f"{question_id} Q0 {section_id} {rank} "
                                f"{hit.score} STANDARD\n"
                                for rank, (hit, section_id) in enumerate(
                                    zip(hits, section_ids), start=1))

        print("==> Results successfully created.\n")