Example #1
    def abstract2vector(self, abstract):
        """
        Changes the abstract into a single averaged vector.
        :param abstract: the abstract to turn into a vector
        :return: a single vector representing the abstract
        """
        abstract_vecs = [useful_functions.sentence2vec(x) for x in abstract]
        avg = np.mean(abstract_vecs, axis=0)
        return avg
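
    # A hedged usage sketch (illustrative, not part of the original class): assuming
    # `prep` is an instance of this class and, per the description strings in the
    # later examples, each sentence vector is 100-dimensional:
    #
    #   abs_vec = prep.abstract2vector([["we", "study"], ["results", "improve"]])
    #   abs_vec.shape   # expected: (100,)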
    def prepare_for_summarisation(self, filename, visualise=False):
        """
        Prepares a paper to be summarised by the Word2Vec method.
        :param filename: the filename of the paper to summarise
        :param visualise: True to insert a zero-vector marker row at each section boundary (for visualising the output)
        :return: the paper in a form suitable to be summarised with the trained models.
        """
        sentences = self.paper2orderedlist(filename)

        # Final form will be an ordered list of tuples, where each tuple shall have the form
        # (sentence_text, sentence_vector, abstract_vector, features).
        final_form = []

        raw_paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)

        abstract = raw_paper["ABSTRACT"]
        abs_vector = self.abstract2vector(abstract)

        prev_section = ""

        if filename not in self.paper_bags_of_words:
            paper_str = useful_functions.read_in_paper(filename)
            paper_str = " ".join(paper_str.values()).lower()
            paper_bag_of_words = useful_functions.calculate_bag_of_words(paper_str)
            self.paper_bags_of_words[filename] = paper_bag_of_words

        if filename not in self.keyphrases:
            self.keyphrases[filename] = raw_paper["KEYPHRASES"]

        for sentence, section in sentences:

            sentence_vector = useful_functions.sentence2vec(sentence, self.word2vec)

            features = self.calculate_features(sentence,
                                               self.paper_bags_of_words[filename],
                                               self.keyphrases[filename],
                                               [" ".join(x) for x in abstract],
                                               " ".join(raw_paper["MAIN-TITLE"][0]),
                                               section,
                                               shorter=True)

            if not visualise:
                final_form.append((sentence, sentence_vector, abs_vector, features))
            else:
                if prev_section != section:
                    print("----> Adding section: ", section)
                    final_form.append(([section],
                                       np.zeros_like(sentence_vector),
                                       np.zeros_like(sentence_vector),
                                       np.zeros_like(features)))
                    prev_section = section
                final_form.append((sentence, sentence_vector, abs_vector, features))

        return final_form
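
Both snippets above lean on useful_functions.sentence2vec, which is not shown. A minimal sketch of the averaged-word-embedding approach the surrounding code suggests, assuming a gensim-style word2vec object that supports `word in model` and `model[word]` lookups, and the 100-dimensional vectors mentioned in the description strings below (the function body and interface are assumptions, not the library's actual code):

import numpy as np

def sentence2vec_sketch(sentence, word2vec, dim=100):
    """Average the embeddings of the known words in a tokenised sentence."""
    # Unknown words are skipped; a sentence with no known words maps to zeros.
    vecs = [word2vec[w] for w in sentence if w in word2vec]
    if not vecs:
        return np.zeros(dim)
    return np.mean(vecs, axis=0)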
Example #3
def process_paper(filename):
    """
    The concurrent function which processes each paper into its training data format.
    :param filename: the filename of the paper to process.
    :return: None; writes the preprocessed file to "Data/Training_Data/"
    """
    #print("--> Started processing ", filename)

    # Start time
    start_time = time.time()

    # Read in the paper
    paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)

    # Extract the gold summary
    gold = paper["HIGHLIGHTS"]
    gold_string_list = [" ".join(x) for x in gold]

    # Extract the title
    title = paper["MAIN-TITLE"][0]
    title_string = " ".join(title)

    # Extract the abstract
    abstract = paper["ABSTRACT"]
    abstract_string_list = [" ".join(x) for x in abstract]

    # Extract the keyphrases
    try:
        keyphrases = paper["KEYPHRASES"][0]
    except IndexError:
        keyphrases = []

    # Turn the paper into a single string and calculate the bag of words score
    paper_string = " ".join([" ".join(x) for val in paper.values() for x in val])
    bag_of_words = useful_functions.calculate_bag_of_words(paper_string)
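
    # (Illustrative assumption: calculate_bag_of_words is not shown, but a bag of
    #  words over this string would typically resemble
    #  collections.Counter(paper_string.split()), a map from word to frequency.)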

    # Get the paper as a list of sentences, associating each sentence with its section name - will be used by oracle
    # to find best summary sentences.
    paper_sentences = [(" ".join(x), key) for key, val in paper.items() for x in val
                       if key != "ABSTRACT"]

    # Create a list of sentences, their ROUGE-L scores with the Highlights and the section they occur in
    # (as a string)
    sents_scores_secs = []

    for sentence, section in paper_sentences:
        # rouge.calc_score expects its candidate argument as a list of sentences, so wrap the single sentence
        r_score = rouge.calc_score([sentence], gold_string_list)

        sents_scores_secs.append((sentence.split(" "), r_score, section))

    # Sort the sentences, scores and sections into descending order
    sents_scores_secs = sorted(sents_scores_secs, key=itemgetter(1), reverse=True)

    pos_sents_scores_secs = sents_scores_secs[:num_summary]
    neg_sents_scores_secs = sents_scores_secs[num_summary:]

    if len(neg_sents_scores_secs) < len(pos_sents_scores_secs):
        print("{}**** NOT A SUFFICIENT AMOUNT OF DATA IN PAPER {}, IGNORING PAPER ****{}".format(
            Color.RED, filename, Color.END))
        return

    # Positive sentences
    positive_sents_secs_class = [(sent, sec, 1) for sent, _, sec in pos_sents_scores_secs]

    # Negative sentences

    # Reverse the remaining sentences so the worst-scoring come first, then take as many as there are positives
    neg_sents_scores_secs = list(reversed(neg_sents_scores_secs))[:len(positive_sents_secs_class)]
    negative_sents_secs_class = [(sent, sec, 0) for sent, _, sec in neg_sents_scores_secs]

    # The balance check above guarantees at least as many negative as positive sentences,
    # so after the truncation the two classes are equal in size and the data is balanced.

    # Concatenate the positive and negative sentences into a single data item and shuffle it
    data = positive_sents_secs_class + negative_sents_secs_class
    random.shuffle(data)

    # Average word vectors of each sentence and convert to list for JSON serialisation
    sentvecs_secs_class = [(useful_functions.sentence2vec(sent).tolist(), sec, y) for sent, sec, y in data]

    # Calculate features for each sentence
    features = [useful_functions.calculate_features(sent, bag_of_words, document_wordcount, keyphrases,
                                                    abstract_string_list, title_string, sec)
                for sent, sec, y in data]

    # Calculate abstract vector
    abs_vector = useful_functions.abstract2vector(abstract_string_list).tolist()

    # Description of the data
    description_text = "All text is of the form of a list of lists, where each sentence is a list of words. The" \
                       " sentences are of the form [(sentence (as a list of words), section in paper," \
                       " classification)]. The sentence vectors are of a similar form, except the sentence text is" \
                       " replaced with the vector representation of the sentence. The features are of the form " \
                       "[(AbstractROUGE, TF-IDF, Document_TF-IDF, keyphrase_score, title_score, numeric_score," \
                       " sentence_length, section)]. The dimensions of each sentence vector are [1x100]. The " \
                       "abstract vector is a single [1x100] vector also."

    # The data item that will be written for this paper
    data_item = {
        "filename": filename,
        "gold": gold,
        "title": paper["MAIN-TITLE"],
        "abstract": abstract,
        "abstract_vec": abs_vector,
        "sentences": data,
        "sentence_vecs": sentvecs_secs_class,
        "sentence_features": features,
        "description": description_text
    }

    # Write the data out
    # Note: filename.strip(".txt") would strip any leading/trailing '.', 't' or 'x'
    # characters rather than the extension, so remove the extension explicitly.
    with open(TRAINING_DATA_WRITE_LOC + filename.replace(".txt", "") + ".json", "w") as f:
        json.dump(data_item, f)

    print("--> Finished processing {}, took {} seconds, data length: {}.".format(
        filename, (time.time() - start_time), len(data)))
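
Example #3's oracle labelling hinges on rouge.calc_score, which scores each sentence's ROUGE-L against the gold highlights. Below is a minimal, recall-oriented reimplementation of ROUGE-L via longest common subsequence, offered as an illustrative sketch of the metric rather than the module used above:

def lcs_len(a, b):
    """Length of the longest common subsequence of two token lists."""
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a, 1):
        for j, y in enumerate(b, 1):
            dp[i][j] = dp[i - 1][j - 1] + 1 if x == y else max(dp[i - 1][j], dp[i][j - 1])
    return dp[-1][-1]

def rouge_l_recall(candidate, references):
    """Best LCS recall of a candidate string against a list of reference strings."""
    cand = candidate.split()
    scores = [lcs_len(cand, ref.split()) / len(ref.split())
              for ref in references if ref.split()]
    return max(scores) if scores else 0.0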
Example #4
    def process_item(self, item):
        """
        Data item is of form:
        data = {
            "filename"
            "gold"
            "title"
            "abstract"
            "sentences"
            "description"
        }
        :param item: the data item to process
        :return: the processed data item
        """

        t = time.time()

        # Get the bag of words representation for this paper.
        bag_of_words = self.paper_bags_of_words[item["filename"]]

        # Get the keyphrases of this paper
        keyphrases = self.keyphrases[item["filename"]]

        # Get the abstract of this paper as a list of strings
        abstract = [" ".join(x) for x in item["abstract"]]

        # Get the title of this paper
        title = item["title"][0]

        # Get a vector representation of the abstract
        abs_vector = self.abstract2vector(abstract)

        # Get vector representations of each of the sentences
        sentence_vectors = [(useful_functions.sentence2vec(x), section, y)
                            for x, section, y in item["sentences"]]

        # Get feature representations of each of the sentences
        features = [
            self.calculate_features(x, bag_of_words, keyphrases, abstract,
                                    title, section, True)
            for x, section, y in item["sentences"]
        ]

        description_text = "All text is of the form of a list of lists, where each sentence is a list of words. The" \
                           " sentences are of the form [(sentence (as a list of words), section in paper," \
                           " classification)]. The sentence vectors are of a similar form, except the sentence text is" \
                           " replaced with the vector representation of the sentence. The features are of the form " \
                           "[(AbstractROUGE, TF-IDF, Document_TF-IDF, keyphrase_score, title_score, numeric_score," \
                           " sentence_length, section)]. The dimensions of each sentence vector are [1x100]. The " \
                           "abstract vector is a single [1x100] vector also."

        new_data = {
            "filename": item["filename"],
            "gold": item["gold"],
            "title": item["title"],
            "abstract": item["abstract"],
            "abstract_vec": abs_vector,
            "sentences": item["sentences"],
            "sentence_vecs": sentence_vectors,
            "sentence_features": features,
            "description": description_text
        }

        print("Done, process took ",
              time.time() - t, " seconds, time since start is ",
              (time.time() - self.start_time) / 60, " minutes")

        return new_data
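
A hedged usage sketch for process_item, assuming `pp` is an instance whose paper_bags_of_words and keyphrases caches already hold entries for the file (the instance name, filename and item contents are illustrative, not from the source):

item = {
    "filename": "example_paper.txt",
    "gold": [["we", "propose", "a", "new", "method"]],
    "title": [["a", "study", "of", "summarisation"]],
    "abstract": [["this", "paper", "examines", "extractive", "summarisation"]],
    "sentences": [(["the", "method", "works", "well"], "CONCLUSION", 1)],
    "description": "raw item before vectors and features are attached",
}
processed = pp.process_item(item)
# processed["sentence_vecs"][0] should be (vector, "CONCLUSION", 1), with the
# vector [1x100] per the description string above.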