コード例 #1
0
    def __init__(self):
        """Set up corpus paths, helper objects and empty result containers."""
        # Corpus locations.
        self.prior_case_dir = 'training_data/Prior_Cases/'
        self.current_case_directory = "training_data/Current_Cases/"

        # Helper objects (no WordnetSimilarityEvaluator is created here).
        self.qg = QueryGenerator()
        self.preprocessor = Preprocessor()
        self.idf_score_evaluator = IDFScore(
            self.current_case_directory, self.prior_case_dir)

        # A plain dict is used; a multiprocessing.Manager-backed dict was
        # tried in an earlier, now disabled, variant of this constructor.
        # synset_generator is defined outside this snippet - presumably it
        # fills self.synset_contents from the prior-case directory (confirm).
        self.synset_contents = {}
        self.synset_generator(self.prior_case_dir)

        # The scorer is handed the very same dict object.
        scorer = CitationSimilarity()
        self.citation_similarity_scorer = scorer
        scorer.synset_contents = self.synset_contents

        self.evaluator = IREvaluator('training_data/qrel.json')

        # Containers populated by later calls.
        self.doc_wise_results = {}
        self.doc_wise_ranking = {}
        self.dir = ""
        self.results = {}
コード例 #2
0
 def __init__(self):
     """Set up corpus paths, the BM25 index and empty result containers."""
     # Corpus locations.
     self.prior_case_dir = 'training_data/Prior_Cases/'
     self.current_case_directory = "training_data/Current_Cases/"

     # Helper objects; BM25 is given the prior-case directory and a
     # single-space delimiter.
     self.bm25 = BM25(self.prior_case_dir, delimiter=' ')
     self.qg = QueryGenerator()
     self.idf_score_evaluator = IDFScore(
         self.current_case_directory, self.prior_case_dir)
     self.evaluator = IREvaluator('training_data/qrel.json')

     # Containers populated by later calls.
     self.doc_wise_results = {}
     self.doc_wise_ranking = {}
     self.results = {}
コード例 #3
0
    def __init__(self):
        """Set up corpus paths, helper objects and empty result containers."""
        # Corpus locations.
        self.prior_case_dir = 'training_data/Prior_Cases/'
        self.current_case_directory = "training_data/Current_Cases/"

        # Helper objects.
        self.qg = QueryGenerator()
        self.preprocessor = Preprocessor()
        self.wordnet = WordnetSimilarityEvaluator()
        self.idf_score_evaluator = IDFScore(
            self.current_case_directory, self.prior_case_dir)

        # sysnet_generator is defined outside this snippet - presumably it
        # fills self.sysnet_contents from the prior-case directory (confirm).
        self.sysnet_contents = {}
        self.sysnet_generator(self.prior_case_dir)

        self.evaluator = IREvaluator('training_data/qrel.json')

        # Containers populated by later calls.
        self.doc_wise_results = {}
        self.doc_wise_ranking = {}
        self.results = {}
コード例 #4
0
class WordnetRunnerMultiprocessSerial:
    """Serial citation-retrieval baseline scored through ``CitationSimilarity``.

    On construction every ``.txt`` file of the prior-case directory is read,
    lower-cased, passed through the preprocessor and cached in
    ``synset_contents``; the same dict object is handed to the
    citation-similarity scorer.  ``execute_baseline`` scores the citation
    contexts of a set of current cases and ``get_eval_scores`` passes the
    resulting rankings to the evaluator.

    All work is done in the calling process; no worker pool is used.
    """

    # Current cases scored when ``execute_baseline`` gets no explicit list.
    DEFAULT_CASE_FILES = (
        'current_case_0001.txt', 'current_case_0030.txt',
        'current_case_0034.txt', 'current_case_0068.txt',
        'current_case_0070.txt', 'current_case_0110.txt',
        'current_case_0133.txt', 'current_case_0163.txt',
    )

    def __init__(self):
        """Create the helper objects and preload the prior-case contents."""
        self.prior_case_dir = 'training_data/Prior_Cases/'
        self.current_case_directory = "training_data/Current_Cases/"
        self.qg = QueryGenerator()
        self.preprocessor = Preprocessor()
        self.idf_score_evaluator = IDFScore(self.current_case_directory,
                                            self.prior_case_dir)
        # Maps prior-case file name -> preprocessed file content.
        self.synset_contents = dict()
        self.synset_generator(self.prior_case_dir)
        self.citation_similarity_scorer = CitationSimilarity()
        # The scorer shares the dict object, it does not get a copy.
        self.citation_similarity_scorer.synset_contents = self.synset_contents
        self.evaluator = IREvaluator('training_data/qrel.json')
        self.doc_wise_results = dict()
        self.doc_wise_ranking = dict()
        # NOTE: this resets the directory recorded by synset_generator above.
        self.dir = ""
        self.results = dict()

    @staticmethod
    def _case_sort_key(name):
        """Return the sort key used for case file names.

        Names starting with a digit sort first, by the integer found after
        their first underscore; all other names sort afterwards, by name.
        """
        if name[0].isdigit():
            return (int(name.partition('_')[2]), name)
        return (float('inf'), name)

    def synset_generator(self, directory):
        """Cache the preprocessed content of every ``.txt`` file in *directory*.

        The directory is remembered in ``self.dir`` and each file's
        lower-cased, preprocessed text is stored in ``self.synset_contents``
        under the file name.

        Args:
            directory: path of the directory to scan.
        """
        self.dir = directory
        for entry in sorted(os.listdir(directory), key=self._case_sort_key):
            filename = os.fsdecode(entry)
            if not filename.endswith(".txt"):
                continue
            with open(os.path.join(directory, filename), 'r') as f:
                content = self.preprocessor.preprocess(f.read().lower())
            print("Adding to dict contents of file : " + filename +
                  "  ; pid : " + str(os.getpid()))
            self.synset_contents[filename] = content

    def iter_as_list(self, n, iterable):
        """Return the first *n* items of *iterable* as a list."""
        return list(islice(iterable, n))

    def convert_to_dict(self, scores):
        """Key *scores* by generated prior-case file names.

        The i-th score (1-based) is stored under ``prior_case_XXXX.txt`` where
        ``XXXX`` is ``i`` zero-padded to four digits.

        Args:
            scores: iterable of scores, presumably ordered like the sorted
                prior-case files (confirm against the scorer).

        Returns:
            dict mapping generated file name to score.
        """
        return {
            'prior_case_' + str(position).zfill(4) + '.txt': score
            for position, score in enumerate(scores, start=1)
        }

    def sort_by_document(self, results):
        """Collapse per-citation scores to one best score per document.

        Args:
            results: mapping ``case -> citation -> {document: score}``.

        Returns:
            Tuple ``(doc_wise_results, doc_wise_ranking)``.  For every case
            the first holds ``(document, best score)`` pairs and the second
            the document names, both ordered by descending best score.  Both
            are the instance attributes, so entries from earlier calls are
            retained.
        """
        for case, citations in results.items():
            best = dict()
            for doc_scores in citations.values():
                for doc, score in doc_scores.items():
                    # Keep the maximum score seen for this document.
                    if doc not in best or score > best[doc]:
                        best[doc] = score
            ranked = sorted(best, key=best.get, reverse=True)
            self.doc_wise_results[case] = [(doc, best[doc]) for doc in ranked]
            self.doc_wise_ranking[case] = ranked

        return self.doc_wise_results, self.doc_wise_ranking

    def query_wordnet_similarity_ranking(self, query, log_str):
        """Score *query* with the citation-similarity scorer.

        Args:
            query: query text built from a citation context.
            log_str: prefix string forwarded to the scorer unchanged.

        Returns:
            Whatever ``citation_similarity_scores`` returns for the query.
        """
        return self.citation_similarity_scorer.citation_similarity_scores(
            query, log_str)

    def _select_query_words(self, surrounding_text):
        """Build the query string for one citation context.

        The text is preprocessed and IDF-scored, and the first half of the
        scored items (in the order ``get_idf_score`` returns them - presumably
        highest IDF first, confirm) is joined with single spaces.
        """
        preprocessed_text = self.preprocessor.preprocess(surrounding_text)
        idf_score = self.idf_score_evaluator.get_idf_score(preprocessed_text)
        kept = self.iter_as_list(len(idf_score) // 2, idf_score.items())
        return " ".join(word for word, _ in kept)

    def execute_baseline(self, case_files=None):
        """Score every citation context of the given current cases.

        Args:
            case_files: iterable of file names inside
                ``self.current_case_directory``; names not ending in ``.txt``
                are skipped.  Defaults to ``DEFAULT_CASE_FILES``.

        Returns:
            ``self.results``: mapping ``file name -> citation index ->
            {prior-case file name: score}``.  Cases without any citation
            context do not appear in the mapping.
        """
        if case_files is None:
            case_files = self.DEFAULT_CASE_FILES

        results = dict()
        for entry in sorted(case_files, key=self._case_sort_key):
            filename = os.fsdecode(entry)
            if not filename.endswith(".txt"):
                continue
            print("Current case : " + filename)
            with open(os.path.join(self.current_case_directory, filename),
                      'r') as f:
                content = f.read()
            surrounding_text_blocks = self.qg.generate_query(content)

            for citation_counter, surrounding_text in enumerate(
                    surrounding_text_blocks):
                selected_words = self._select_query_words(surrounding_text)
                log_str = ("Current Case : " + filename +
                           "  ;  Citation counter : " +
                           str(citation_counter) + " ; ")
                print(log_str)
                sys.stdout.flush()
                wordnet_scores = self.query_wordnet_similarity_ranking(
                    selected_words, log_str)
                wordnet_score_dict = self.convert_to_dict(wordnet_scores)
                results.setdefault(filename, dict())[citation_counter] = (
                    copy.deepcopy(wordnet_score_dict))

        self.results = copy.deepcopy(results)
        return self.results

    def get_eval_scores(self, results):
        """Rank documents per case, print per-case results, return totals.

        Args:
            results: mapping as produced by ``execute_baseline``.

        Returns:
            The value of ``self.evaluator.get_total_scores()``.
        """
        _, doc_wise_ranking = self.sort_by_document(results)
        print(self.evaluator.get_doc_wise_results(doc_wise_ranking))
        print("___________________________________________")
        return self.evaluator.get_total_scores()
コード例 #5
0
class WordnetRunner:
    """Citation-retrieval baseline scored with WordNet path similarity.

    On construction every ``.txt`` file of the prior-case directory is read,
    lower-cased and converted with ``WordnetSimilarityEvaluator.doc_to_synsets``;
    the result is cached in ``sysnet_contents``.  ``execute_baseline`` scores
    the citation contexts of the current cases against that cache and
    ``get_eval_scores`` passes the resulting rankings to the evaluator.
    """

    def __init__(self):
        """Create the helper objects and preload the prior-case synsets."""
        self.prior_case_dir = 'training_data/Prior_Cases/'
        self.current_case_directory = "training_data/Current_Cases/"
        self.qg = QueryGenerator()
        self.preprocessor = Preprocessor()
        self.wordnet = WordnetSimilarityEvaluator()
        self.idf_score_evaluator = IDFScore(self.current_case_directory,
                                            self.prior_case_dir)
        # Maps prior-case file name -> result of doc_to_synsets for the file.
        self.sysnet_contents = dict()
        self.sysnet_generator(self.prior_case_dir)
        self.evaluator = IREvaluator('training_data/qrel.json')
        self.doc_wise_results = dict()
        self.doc_wise_ranking = dict()
        self.results = dict()

    @staticmethod
    def _case_sort_key(name):
        """Return the sort key used for case file names.

        Names starting with a digit sort first, by the integer found after
        their first underscore; all other names sort afterwards, by name.
        """
        if name[0].isdigit():
            return (int(name.partition('_')[2]), name)
        return (float('inf'), name)

    def sysnet_generator(self, directory):
        """Cache the synset form of every ``.txt`` file in *directory*.

        A progress line is printed for every directory entry, including
        entries that are skipped because they do not end in ``.txt``.

        Args:
            directory: path of the directory to scan.
        """
        entries = sorted(os.listdir(directory), key=self._case_sort_key)
        for file_counter, entry in enumerate(entries, start=1):
            print("Converting to sysnet of file no. : " + str(file_counter))
            filename = os.fsdecode(entry)
            if not filename.endswith(".txt"):
                continue
            with open(os.path.join(directory, filename), 'r') as f:
                content = f.read().lower()
            self.sysnet_contents[filename] = self.wordnet.doc_to_synsets(
                content)

    def iter_as_list(self, n, iterable):
        """Return the first *n* items of *iterable* as a list."""
        return list(islice(iterable, n))

    def convert_to_dict(self, scores):
        """Key *scores* by generated prior-case file names.

        The i-th score (1-based) is stored under ``prior_case_XXXX.txt`` where
        ``XXXX`` is ``i`` zero-padded to four digits.

        Args:
            scores: iterable of scores in the order produced by
                ``query_wordnet_similarity_ranking``.

        Returns:
            dict mapping generated file name to score.
        """
        return {
            'prior_case_' + str(position).zfill(4) + '.txt': score
            for position, score in enumerate(scores, start=1)
        }

    def sort_by_document(self, results):
        """Collapse per-citation scores to one best score per document.

        Args:
            results: mapping ``case -> citation -> {document: score}``.

        Returns:
            Tuple ``(doc_wise_results, doc_wise_ranking)``.  For every case
            the first holds ``(document, best score)`` pairs and the second
            the document names, both ordered by descending best score.  Both
            are the instance attributes, so entries from earlier calls are
            retained.
        """
        for case, citations in results.items():
            best = dict()
            for doc_scores in citations.values():
                for doc, score in doc_scores.items():
                    # Keep the maximum score seen for this document.
                    if doc not in best or score > best[doc]:
                        best[doc] = score
            ranked = sorted(best, key=best.get, reverse=True)
            self.doc_wise_results[case] = [(doc, best[doc]) for doc in ranked]
            self.doc_wise_ranking[case] = ranked

        return self.doc_wise_results, self.doc_wise_ranking

    def query_wordnet_similarity_ranking(self, query):
        """Score *query* against every cached prior case.

        Args:
            query: query text built from a citation context.

        Returns:
            List with one ``sysnset_path_similarity`` result per cached prior
            case, in sorted file-name order.
        """
        # The query's synsets do not depend on the document, so they are
        # computed once rather than once per prior case.
        query_synsets = self.wordnet.doc_to_synsets(query)
        scores = []
        ordered_files = sorted(self.sysnet_contents, key=self._case_sort_key)
        for counter, filename in enumerate(ordered_files):
            scores.append(self.wordnet.sysnset_path_similarity(
                query_synsets, self.sysnet_contents[filename]))
            print("Comparison no. : " + str(counter))
            sys.stdout.flush()

        return scores

    def execute_baseline(self, max_cases=3):
        """Score every citation context of the first current cases.

        Args:
            max_cases: non-negative number of directory entries (in sorted
                order, counting entries that are skipped for not ending in
                ``.txt``) to process; ``None`` processes all of them.  The
                default keeps the original "only a few files" limit of 3.

        Returns:
            ``self.results``: mapping ``file name -> citation index ->
            {prior-case file name: score}``.
        """
        entries = sorted(os.listdir(self.current_case_directory),
                         key=self._case_sort_key)
        if max_cases is not None:
            entries = entries[:max_cases]

        for no_of_cases, entry in enumerate(entries, start=1):
            filename = os.fsdecode(entry)
            if filename.endswith(".txt"):
                with open(os.path.join(self.current_case_directory, filename),
                          'r') as f:
                    content = f.read()
                surrounding_text_blocks = self.qg.generate_query(content)
                for citation_counter, surrounding_text in enumerate(
                        surrounding_text_blocks):
                    preprocessed_text = self.preprocessor.preprocess(
                        surrounding_text)
                    idf_score = self.idf_score_evaluator.get_idf_score(
                        preprocessed_text)
                    # Only the first half of the scored items is kept
                    # (presumably the highest-IDF words - confirm the order
                    # returned by get_idf_score).
                    kept = self.iter_as_list(len(idf_score) // 2,
                                             idf_score.items())
                    selected_words = " ".join(word for word, _ in kept)
                    wordnet_scores = self.query_wordnet_similarity_ranking(
                        selected_words)
                    wordnet_score_dict = self.convert_to_dict(wordnet_scores)

                    # Store the scores of EVERY citation; previously the
                    # store was nested under the "first time we see this
                    # file" check, so only citation 0 was ever recorded.
                    self.results.setdefault(filename, dict())[
                        citation_counter] = copy.deepcopy(wordnet_score_dict)
                    print("Citation counter : " + str(citation_counter + 1))
                    sys.stdout.flush()

            print(str(no_of_cases) + " : " + filename)
            sys.stdout.flush()

        return self.results

    def get_eval_scores(self, results):
        """Rank documents per case, print per-case results, return totals.

        Args:
            results: mapping as produced by ``execute_baseline``.

        Returns:
            The value of ``self.evaluator.get_total_scores()``.
        """
        _, doc_wise_ranking = self.sort_by_document(results)
        print(self.evaluator.get_doc_wise_results(doc_wise_ranking))
        print("___________________________________________")
        return self.evaluator.get_total_scores()