Example 1
0
                                               preserve_order=True)
        return paper


if __name__ == "__main__":
    # Sample papers used during development:
    #   S0168874X14001395.txt, S0141938215300044.txt, S0142694X15000423.txt
    summ = AbstractRougeSummariser()
    #summ.summarise("S0142694X15000423.txt")

    # Walk the paper directory, stopping after 150 directory entries.
    # The very first entry (index 0) is deliberately skipped.
    for index, paper_file in enumerate(os.listdir(PAPER_SOURCE)):
        if index > 150:
            break
        if not (paper_file.endswith(".txt") and index > 0):
            continue

        # Write the paper's highlights out as a gold summary under the same
        # name as the generated summary so the two can be compared.
        gold = useful_functions.read_in_paper(paper_file,
                                              True)["HIGHLIGHTS"]
        useful_functions.write_gold(SUMMARY_WRITE_LOC, gold, paper_file)

        # Display a loading bar of progress
        useful_functions.loading_bar(LOADING_SECTION_SIZE, index,
                                     NUMBER_OF_PAPERS)

        # Generate and write a summary for this paper
        summ.summarise(paper_file)
Example 2
0
    def prepare_data(self):
        """
        Puts the data in a form suitable for the Word2Vec classifier - it changes each sentence into the average of
        its constituent word vectors.

        For each paper, the ``self.num_summary`` sentences with the highest
        ROUGE score against the highlights are labelled 1 and an equal number
        of the lowest-scoring sentences are labelled 0, so the data is
        balanced.

        :return: a list of dicts, one per paper, each holding the labelled
                 sentences plus the paper's title, abstract and gold
                 highlights (data is balanced).
        """
        # Count of how many papers have been processed; drives the loading
        # bar and the periodic checkpoint writes below.
        count = 0

        # One dict per processed paper (see the literal appended at the end
        # of the loop body).
        data = []

        r = Rouge()

        # Iterate over every file in the paper directory
        for filename in os.listdir(PAPER_SOURCE):

            # Ignores files which are not papers e.g. hidden files
            if not filename.endswith(".txt"):
                continue

            # Display a loading bar of progress
            useful_functions.loading_bar(self.loading_section_size, count,
                                         self.number_of_papers)
            count += 1

            # Opens the paper as a dictionary, with keys corresponding to
            # section titles and values corresponding to the text in that
            # section. The text is given as a list of lists, each list being
            # a list of words corresponding to a sentence.
            paper = useful_functions.read_in_paper(filename,
                                                   sentences_as_lists=True)

            # Get the highlights of the paper
            highlights = paper["HIGHLIGHTS"]
            highlights_join = [" ".join(x) for x in highlights]
            abstract = paper["ABSTRACT"]

            # Score every sentence outside the abstract against the
            # highlights with ROUGE.
            sentences = []
            # .items() replaces the Python-2-only .iteritems(), which raises
            # AttributeError under Python 3.
            for section, sents in paper.items():
                for sentence in sents:
                    # We don't want to calculate ROUGE for the abstract
                    if section != "ABSTRACT":
                        r_score = r.calc_score([" ".join(sentence)],
                                               highlights_join)
                        sentences.append((sentence, r_score, section))

            # Highest-scoring sentences first; the score itself is dropped.
            # reversed(sorted(...)) is kept (rather than reverse=True) to
            # preserve the original tie-breaking order.
            sentences = [(sent, sec) for sent, score, sec in reversed(
                sorted(sentences, key=itemgetter(1)))]

            sents_pos = sentences[0:self.num_summary]
            sents_neg = sentences[self.num_summary:]

            # Skip papers too short to supply as many negatives as
            # positives; the dataset must stay balanced.
            if len(sents_neg) < len(sents_pos):
                continue

            # Label the best sentences 1 and an equal number of the
            # worst-scoring sentences 0.
            sents_pos = [(sent, sec, 1) for sent, sec in sents_pos]
            sents_neg = list(reversed(sents_neg))[:len(sents_pos)]
            sents_neg = [(sent, sec, 0) for sent, sec in sents_neg]
            sents_class = sents_pos + sents_neg
            random.shuffle(sents_class)

            # Each item in the sentence list has form
            # (sentence, section, classification).
            data.append({
                "filename": filename,
                "title": paper["MAIN-TITLE"],
                "gold": paper["HIGHLIGHTS"],
                "abstract": abstract,
                "sentences": sents_class,
                "description":
                    "All text data is given in the form of a list of words."
            })

            # Periodically checkpoint the accumulated data to disk so a
            # long run can be resumed/inspected.
            if count % 1000 == 0:
                print("\nWriting data...")
                write_dir = BASE_DIR + "/Data/Generated_Data/Sentences_And_SummaryBool/Abstract_Neg/AbstractNet/"
                with open(write_dir + "data.pkl", "wb") as f:
                    pickle.dump(data, f)
                print("Done")

        return data