Code example #1
    def prepare_for_summarisation(self, filename, visualise=False):
        """
        Prepares a paper to be summarised by the Word2Vec method.
        :param filename: the filename of the paper to summarise
        :param visualise: true if visualising
        :return: the paper in a form suitable to be summarised with the trained models.
        """
        sentences = self.paper2orderedlist(filename)

        # Final form will be an ordered list of tuples, where each tuple shall have the form
        # (sentence_text, sentence_vector, abstract_vector, features).
        final_form = []

        raw_paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)

        abstract = raw_paper["ABSTRACT"]
        abs_vector = self.abstract2vector(abstract)

        prev_section = ""

        # Use the cached bag-of-words counts for this paper if available;
        # otherwise compute them from the full paper text and cache them.
        try:
            bow = self.paper_bags_of_words[filename]
        except KeyError:
            paper_str = useful_functions.read_in_paper(filename)
            paper_str = " ".join([val for _, val in paper_str.iteritems()]).lower()
            paper_bag_of_words = useful_functions.calculate_bag_of_words(paper_str)
            self.paper_bags_of_words[filename] = paper_bag_of_words

        # Likewise, cache the paper's keyphrases, taking them from the raw paper
        # on first use.
        try:
            kf = self.keyphrases[filename]
        except KeyError:
            kfs = raw_paper["KEYPHRASES"]
            self.keyphrases[filename] = kfs

        for sentence, section in sentences:

            sentence_vector = useful_functions.sentence2vec(sentence, self.word2vec)

            features = self.calculate_features(sentence,
                                               self.paper_bags_of_words[filename],
                                               self.keyphrases[filename],
                                               [" ".join(x) for x in abstract],
                                               " ".join(raw_paper["MAIN-TITLE"][0]),
                                               section,
                                               shorter=True)

            if not visualise:
                final_form.append((sentence, sentence_vector, abs_vector, features))
            else:
                if prev_section != section:
                    print("----> Adding section: ", section)
                    final_form.append(([section], np.zeros_like(sentence_vector), np.zeros_like(sentence_vector), np.zeros_like(features)))
                    prev_section = section
                final_form.append((sentence, sentence_vector, abs_vector, features))

        return final_form
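
For orientation, the tuples returned above could be split into parallel collections before being handed to a trained model. The function below is a hypothetical consumer, not part of the original code; it relies only on the (sentence_text, sentence_vector, abstract_vector, features) layout described in the comment at the top of the method.

import numpy as np

def to_model_inputs(prepared_paper):
    """Split the output of prepare_for_summarisation into parallel collections."""
    sentences = [sent for sent, _, _, _ in prepared_paper]
    sentence_vecs = np.vstack([vec for _, vec, _, _ in prepared_paper])
    abstract_vecs = np.vstack([vec for _, _, vec, _ in prepared_paper])
    # The features may mix numeric scores with a section label, so keep them as-is.
    features = [feats for _, _, _, feats in prepared_paper]
    return sentences, sentence_vecs, abstract_vecs, features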
Code example #2
 def prepare_paper(self, filename):
     """
     Prepares the paper for summarisation.
     :return: The paper in a form suitable for summarisation
     """
     paper = useful_functions.read_in_paper(filename, sentences_as_lists=True, preserve_order=True)
     return paper
Code example #3
    def paper2orderedlist(self, filename):
        """
        Performs the first step in summarising a paper: turning it into an ordered list of sentences that excludes
        the highlights and abstract sections of the paper (as these are already summaries).
        :param filename: the filename to summarise.
        :return: the paper as an ordered list of sentences, not including abstract or highlights.
        """
        paper = useful_functions.read_in_paper(filename, sentences_as_lists=True, preserve_order=True)

        # We don't want to make any predictions for the Abstract or Highlights as these are already summaries.
        sections_to_predict_for = []
        for section, text in paper.iteritems():

            if section != "ABSTRACT" and section != "HIGHLIGHTS":
                sections_to_predict_for.append(text)

        # Sorts the sections according to the order in which they appear in the paper.
        sorted_sections_to_predict_for = sorted(sections_to_predict_for, key=itemgetter(1))

        # Creates an ordered list of the sentences in the paper
        sentence_list = []
        for sentence_text, section_position_in_paper in sorted_sections_to_predict_for:
            section_sentences = sentence_text
            for sentence in section_sentences:
                sentence_list.append(sentence)

        return sentence_list
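
The sort above only works if each section value is a (sentences, position_in_paper) pair, so that itemgetter(1) orders the sections by where they appear in the paper. A small self-contained illustration of that assumed shape:

from operator import itemgetter

# Hypothetical section values of the form (list_of_sentences, position_in_paper).
sections = [
    ([["results", "are", "good"]], 3),
    ([["we", "introduce", "a", "method"]], 1),
    ([["related", "work", "exists"]], 2),
]

# Same idiom as above: order the sections by the position stored at index 1.
ordered = sorted(sections, key=itemgetter(1))

# Flatten into a single ordered sentence list, as paper2orderedlist does.
sentence_list = [sentence for section_sentences, _ in ordered
                 for sentence in section_sentences]
print(sentence_list)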
Code example #4
def dump_keyphrases_pkl(out_file):
    files = os.listdir(BASE_DIR +
                       "Data/Papers/Full/Papers_With_Section_Titles/")
    keyphrases = {}
    for file in files:
        paper = useful_functions.read_in_paper(file, sentences_as_lists=False)
        keyphrases[file] = paper["KEYPHRASES"]
    write_pkl(keyphrases, out_file)
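
write_pkl is not defined in these examples; assuming it is a thin wrapper around pickle, it could look like this:

import pickle

def write_pkl(obj, out_file):
    # Assumed helper: serialise the dictionary to disk with pickle.
    with open(out_file, "wb") as f:
        pickle.dump(obj, f)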
Code example #5
def dump_bow_pkl(out_file):
    files = os.listdir(BASE_DIR +
                       "Data/Papers/Full/Papers_With_Section_Titles/")
    bow = {}
    for file in files:
        paper = useful_functions.read_in_paper(file, sentences_as_lists=False)
        paper_string = " ".join(v for k, v in paper.iteritems())
        bow[file] = useful_functions.calculate_bag_of_words(paper_string)
    write_pkl(bow, out_file)
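
The pickles written by dump_keyphrases_pkl and dump_bow_pkl presumably feed the self.keyphrases and self.paper_bags_of_words caches used in code example #1. A minimal loader, assuming they are plain pickle files:

import pickle

def load_pkl(in_file):
    # Counterpart to the assumed write_pkl helper above.
    with open(in_file, "rb") as f:
        return pickle.load(f)

# e.g. self.paper_bags_of_words = load_pkl(bow_pkl_path)
#      self.keyphrases = load_pkl(keyphrases_pkl_path)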
Code example #6
    def prepare_paper(self, filename):
        """
        Prepares the paper for summarisation.
        :return: The paper in a form suitable for summarisation
        """
        paper = useful_functions.read_in_paper(filename, sentences_as_lists=True,
                                               preserve_order=True)
        return paper


if __name__ == "__main__":
    # Paper One: S0168874X14001395.txt
    # Paper Two: S0141938215300044.txt
    # Paper Three: S0142694X15000423.txt
    summ = AbstractRougeSummariser()
    #summ.summarise("S0142694X15000423.txt")

    count = 0
    for filename in os.listdir(PAPER_SOURCE):
        if count > 150:
            break
        if filename.endswith(".txt") and count > 0:

            # We need to write the highlights as a gold summary with the same name as the generated summary.
            highlights = useful_functions.read_in_paper(filename,
                                                        True)["HIGHLIGHTS"]
            useful_functions.write_gold(SUMMARY_WRITE_LOC, highlights,
                                        filename)

            # Display a loading bar of progress
            useful_functions.loading_bar(LOADING_SECTION_SIZE, count,
                                         NUMBER_OF_PAPERS)

            # Generate and write a summary
            summ.summarise(filename)

        count += 1
Code example #7
        tf_idfs.append(word_tf_idf)

    return [x for x in zip(sentence, tf_idfs)]


with open(BASE_DIR + "/Visualisations/base_html.txt", "rb") as f:
    html = f.readlines()

html.append("<body>\n")
html.append("<div class=\"container\">\n")
html.append("<div id=\"title\" class=\"text\">")

filename = "S0920548915000744.txt"
paper = useful_functions.read_in_paper(filename,
                                       sentences_as_lists=True,
                                       preserve_order=True)

html.append("<h1>" + " ".join(paper["MAIN-TITLE"][0][0]) + "</h1>")
html.append("</div>")
html.append("<div id=\"gold\" class=\"text\">")
html.append("<h2>Human Written Summary</h2>")
html.append("<hr>")
html.append("<br>")
html.append("<p>")

highlights = paper["HIGHLIGHTS"]
print("Reading stuff...")
bag_of_words = defaultdict(float)
for key, val in paper.iteritems():
    sents = val[0]
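
The snippet above builds the visualisation page into the html list; eventually the open tags need to be closed and the page written to disk. A hypothetical final step (the output path and the exact closing tags are assumptions, not part of the snippet):

# Hypothetical closing step for the visualisation page.
html.append("</p>")
html.append("</div>")
html.append("</div>")
html.append("</body>")
html.append("</html>")

with open("visualisation_" + filename.replace(".txt", ".html"), "w") as out_f:
    out_f.write("".join(html))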
Code example #8
def process_paper(filename):
    """
    The concurrent function which processes each paper into its training data format.
    :param filename: the filename of the paper to process.
    :return: None, but writes the preprocessed file to "Data/Training_Data/"
    """
    #print("--> Started processing ", filename)

    # Start time
    start_time = time.time()

    # Read in the paper
    paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)

    # Extract the gold summary
    gold = paper["HIGHLIGHTS"]
    gold_string_list = [" ".join(x) for x in gold]

    # Extract the title
    title = paper["MAIN-TITLE"][0]
    title_string = " ".join(title)

    # Extract the abstract
    abstract = paper["ABSTRACT"]
    abstract_string_list = [" ".join(x) for x in abstract]

    # Extract the keyphrases
    try:
        keyphrases = paper["KEYPHRASES"][0]
    except IndexError:
        keyphrases = []

    # Turn the paper into a single string and calculate the bag of words score
    paper_string = " ".join([" ".join(x) for key, val in paper.iteritems() for x in val])
    bag_of_words = useful_functions.calculate_bag_of_words(paper_string)

    # Get the paper as a list of sentences, associating each sentence with its section name - will be used by oracle
    # to find best summary sentences.
    paper_sentences = [(" ".join(x), key) for key, val in paper.iteritems() for x in val
                       if key != "ABSTRACT"]

    # Create a list of sentences, their ROUGE-L scores with the Highlights and the section they occur in
    # (as a string)
    sents_scores_secs = []

    for sentence, section in paper_sentences:
        # rouge.calc_score expects the candidate sentence to be wrapped in a list
        r_score = rouge.calc_score([sentence], gold_string_list)

        sents_scores_secs.append((sentence.split(" "), r_score, section))

    # Sort the sentences, scores and sections into descending order
    sents_scores_secs = sorted(sents_scores_secs, key=itemgetter(1), reverse=True)

    pos_sents_scores_secs = sents_scores_secs[:num_summary]
    neg_sents_scores_secs = sents_scores_secs[num_summary:]

    if len(neg_sents_scores_secs) < len(pos_sents_scores_secs):
        print("{}**** NOT A SUFFICIENT AMOUNT OF DATA IN PAPER {}, IGNORING PAPER ****{}".format(
            Color.RED, filename, Color.END))
        return

    # Positive sentences
    positive_sents_secs_class = [(sent, sec, 1) for sent, _, sec in pos_sents_scores_secs]

    # Negative sentences

    # Take the sentences not used as positive, reverse them so the worst scores come first, then take an equal number
    neg_sents_scores_secs = [x for x in reversed(neg_sents_scores_secs)][:len(positive_sents_secs_class)]
    negative_sents_secs_class = [(sent, sec, 0) for sent, _, sec in neg_sents_scores_secs]

    # Don't create data from this paper if it's less than 40 sentences - i.e. there would be more positive than
    # negative data. The data needs to be balanced.
    #if len(positive_sents_secs_class) != len(negative_sents_secs_class):
    #    print("{}**** NOT A SUFFICIENT AMOUNT OF DATA IN PAPER {}, IGNORING PAPER ****{}".format(
    #        Color.RED, filename, Color.END))
    #    return

    # Concatenate the positive and negative sentences into a single data item and shuffle it
    data = positive_sents_secs_class + negative_sents_secs_class
    random.shuffle(data)

    # Average word vectors of each sentence and convert to list for JSON serialisation
    sentvecs_secs_class = [(useful_functions.sentence2vec(sent).tolist(), sec, y) for sent, sec, y in data]

    # Calculate features for each sentence
    features = [useful_functions.calculate_features(sent, bag_of_words, document_wordcount, keyphrases,
                                                    abstract_string_list, title_string, sec)
                for sent, sec, y in data]

    # Calculate abstract vector
    abs_vector = useful_functions.abstract2vector(abstract_string_list).tolist()

    # Description of the data
    description_text = "All text is of the form of a list of lists, where each sentence is a list of words. The" \
                       " sentences are of the form [(sentence (as a list of words), section in paper," \
                       " classification)]. The sentence vectors are of a similar form, except the sentence text is" \
                       " replaced with the vector representation of the sentence. The features are of the form " \
                       "[(AbstractROUGE, TF-IDF, Document_TF-IDF, keyphrase_score, title_score, numeric_score," \
                       " sentence_length, section)]. The dimensions of each sentence vector are [1x100]. The " \
                       "abstract vector is a single [1x100] vector also."

    # The data item that will be written for this paper
    data_item = {
        "filename": filename,
        "gold": gold,
        "title": paper["MAIN-TITLE"],
        "abstract": abstract,
        "abstract_vec": abs_vector,
        "sentences": data,
        "sentence_vecs": sentvecs_secs_class,
        "sentence_features": features,
        "description": description_text
    }

    # Write the data out
    with open(TRAINING_DATA_WRITE_LOC + filename.replace(".txt", "") + ".json", "wb") as f:
        json.dump(data_item, f)

    print("--> Finished processing {}, took {} seconds, data length: {}.".format(
        filename, (time.time() - start_time), len(data)))
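
The docstring above describes process_paper as "the concurrent function", but the driver that runs it is not included. A plausible driver using multiprocessing, assuming PAPER_SOURCE points at the directory of papers:

import os
from multiprocessing import Pool

if __name__ == "__main__":
    # Assumed driver: preprocess every paper in parallel worker processes.
    papers = [f for f in os.listdir(PAPER_SOURCE) if f.endswith(".txt")]
    pool = Pool()
    pool.map(process_paper, papers)
    pool.close()
    pool.join()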
Code example #9
    def prepare_data(self):
        """
        Puts the data in a form suitable for the Word2Vec classifier - it changes each sentence into the average of
        its constituent word vectors.
        :return: all sentences as vectors and their classification (data is balanced).
        """

        # Count of how many papers have been processed.
        count = 0

        # Sentences as vectors with their classification
        data = []

        # Count of positive data
        pos_count = 0

        # Count of negative data
        neg_count = 0

        r = Rouge()

        # Iterate over every file in the paper directory
        for filename in os.listdir(PAPER_SOURCE):

            # Ignores files which are not papers e.g. hidden files
            if filename.endswith(".txt"):

                # Display a loading bar of progress
                useful_functions.loading_bar(self.loading_section_size, count,
                                             self.number_of_papers)
                count += 1

                # Opens the paper as a dictionary, with keys corresponding to section titles and values corresponding
                # to the text in that section. The text is given as a list of lists, each list being a list of words
                # corresponding to a sentence.
                paper = useful_functions.read_in_paper(filename,
                                                       sentences_as_lists=True)

                # Get the highlights of the paper
                highlights = paper["HIGHLIGHTS"]
                highlights_join = [" ".join(x) for x in highlights]
                abstract = paper["ABSTRACT"]

                sentences = []

                # Iterate over the whole paper
                for section, sents in paper.iteritems():

                    # Iterate over each sentence in the section
                    for sentence in sents:

                        # We don't want to calculate ROUGE for the abstract
                        if section != "ABSTRACT":
                            # Calculate the ROUGE score and add it to the list
                            r_score = r.calc_score([" ".join(sentence)],
                                                   highlights_join)
                            sentences.append((sentence, r_score, section))

                sentences = [(x, section) for x, score, section in reversed(
                    sorted(sentences, key=itemgetter(1)))]

                sents_pos = sentences[0:self.num_summary]
                sents_neg = sentences[self.num_summary:]

                if len(sents_neg) < len(sents_pos):
                    continue

                sents_pos = [(x[0], x[1], y)
                             for x, y in zip(sents_pos, [1] * len(sents_pos))]
                sents_neg = [x for x in reversed(sents_neg)][:len(sents_pos)]
                sents_neg = [(x[0], x[1], y)
                             for x, y in zip(sents_neg, [0] * len(sents_neg))]
                sents_class = sents_pos + sents_neg
                random.shuffle(sents_class)

                # Each item in the sentence list has form [(sentence, section, classification)]
                paper = {
                    "filename": filename,
                    "title": paper["MAIN-TITLE"],
                    "gold": paper["HIGHLIGHTS"],
                    "abstract": abstract,
                    "sentences": sents_class,
                    "description": "All text data is given in the form of a list of words."
                }

                data.append(paper)

                if count % 1000 == 0:
                    print("\nWriting data...")
                    write_dir = BASE_DIR + "/Data/Generated_Data/Sentences_And_SummaryBool/Abstract_Neg/AbstractNet/"
                    with open(write_dir + "data.pkl", "wb") as f:
                        pickle.dump(data, f)
                    print("Done")

        return data
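
The pickle written inside prepare_data can later be read back and flattened for training. A minimal sketch, assuming the same write location and the per-paper dictionary format built above:

import pickle

def load_training_sentences(pkl_path):
    """Flatten the pickled papers into (sentence, section, classification) triples."""
    with open(pkl_path, "rb") as f:
        papers = pickle.load(f)
    triples = []
    for paper in papers:
        # Each entry in paper["sentences"] has the form (sentence, section, classification).
        triples.extend(paper["sentences"])
    return triples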