class SummariserNetV2Summariser(Summariser):
    """
    Implements a logistic regression summariser that used a logistic regression classifier to tell if sentences are
    summary sentences or not.
    """

    def __init__(self):
        """
        ROUGE based summariser. This compares each sentence in the paper to the abstract to see which ones make the best
        summaries for the abstract. It is assumed that these sentences will then also be good highlights for the paper.
        """
        self.summary_length = 10
        self.r = Rouge()
        self.preprocessor = AbstractNetPreprocessor()
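
        # Build the computation graph once and keep handles to its input and
        # output tensors (the dictionary keys are assumed to match those
        # exposed by this architecture's graph() factory).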
        self.computation_graph = graph()
        self.sentence_input = self.computation_graph["sentence_input"]
        self.features_input = self.computation_graph["features_input"]
        self.seq_lens = self.computation_graph["sequence_lengths"]
        self.prediction_probs = self.computation_graph["raw_predictions"]
        self.keep_prob = self.computation_graph["keep_prob"]
        self.similarity_threshold = 0.75


    def summarise(self, filename):
        """
        Generates a summary of the paper.
        :param filename: the name of the file to summaries
        :return: a sumamry of the paper.
        """

        # Each item has form (sentence, sentence_vector, abstract_vector, features)
        paper = self.prepare_paper(filename)

        # ========> Code from here on is summariser specific <========

        with tf.Session() as sess:

            # Initialise all variables
            sess.run(tf.global_variables_initializer())

            # Saving object
            saver = tf.train.Saver()

            # Restore the saved model
            saver.restore(sess, SAVE_PATH)

            # Stores sentences, the probability of them being good summaries and their position in the paper
            sentences_and_summary_probs = []

            # Number of sentences in the paper
            num_sents = len(paper)

            # ----> Create the matrix for sentences for the LSTM <----
            sentence_list = []

            for sent, _, _, _ in paper:
                # Truncate overly long sentences to the model's maximum length
                sentence_list.append(sent[:MAX_SENT_LEN])

            # Get the matrix representation of the sentences
            sentence_matrix, sent_lens = sents2input(sentence_list, num_sents)
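            # sentence_matrix is assumed to have shape
            # (num_sents, MAX_SENT_LEN, embedding_dim); sent_lens holds the
            # number of words in each (possibly truncated) sentence.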

            # ----> Create the matrix of features for the LSTM <----
            feature_matrix = np.zeros((num_sents, NUM_FEATURES), dtype=np.float32)

            for i, (_, _, _, feat) in enumerate(paper):
                feature_matrix[i, :] = feat

            # Create the feed_dict
            feed_dict = {
                self.sentence_input: sentence_matrix,
                self.features_input: feature_matrix,
                self.seq_lens: sent_lens,
                self.keep_prob: 1.0  # no dropout at inference time
            }

            # Predict how good a summary each sentence is using the computation graph
            probs = sess.run(self.prediction_probs, feed_dict=feed_dict)

            # Store the sentences and probabilities in a list to be sorted
            for i in range(num_sents):
                sentence = paper[i][0]
                sentence_vec = paper[i][1]
                prob = probs[i][1]  # probability of the positive (summary) class
                sentences_and_summary_probs.append((sentence, sentence_vec, prob, i))

            # Sort by descending probability of being a good summary sentence
            sentences_and_summary_probs = sorted(
                sentences_and_summary_probs, key=itemgetter(2), reverse=True)

            # Take the top-scoring sentences, skipping very short ones
            summary = []
            for sent, sent_vec, prob, pos in sentences_and_summary_probs:
                if len(summary) >= self.summary_length:
                    break

                # Skip very short sentences (fewer than 10 words)
                if len(sent) < 10:
                    continue

                summary.append((sent, sent_vec, prob, pos))

            # Order summary sentences according to the order they appear in the paper
            ordered_summary = sorted(summary, key=itemgetter(-1))

            # Convert summary sentences back to strings, keeping their positions
            summary = []

            for sentence, sentence_vec, prob, pos in ordered_summary:
                sentence = " ".join(sentence)
                summary.append((sentence, pos))

        # str.strip(".txt") strips characters rather than the suffix, which can
        # mangle filenames; remove the extension explicitly instead.
        paper_name = filename[:-len(".txt")] if filename.endswith(".txt") else filename
        useful_functions.write_summary(SUMMARY_WRITE_LOC, summary, paper_name)



    def load_model(self):
        """
        Loads the classification model
        :return: the classification model
        """
        pass

    def prepare_paper(self, filename):
        """
        Prepares the paper for summarisation.
        :return: The paper in a form suitable for summarisation
        """
        paper = self.preprocessor.prepare_for_summarisation(filename)
        return paper
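
# A minimal usage sketch (assumes a trained checkpoint at SAVE_PATH and a paper
# file that AbstractNetPreprocessor can locate; the filename is illustrative):
#
#     summariser = SummariserNetV2Summariser()
#     summariser.summarise("example_paper.txt")
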
class KeyphraseScoreSummariser(Summariser):
    """
    Implements a logistic regression summariser that used a logistic regression classifier to tell if sentences are
    summary sentences or not.
    """
    def __init__(self):
        """
        ROUGE based summariser. This compares each sentence in the paper to the abstract to see which ones make the best
        summaries for the abstract. It is assumed that these sentences will then also be good highlights for the paper.
        """
        self.summary_length = 10
        self.r = Rouge()
        self.preprocessor = AbstractNetPreprocessor()

    def summarise(self, filename):
        """
        Generates a summary of the paper.
        :param filename: the name of the file to summaries
        :return: a sumamry of the paper.
        """

        # Each item has form (sentence, sentence_vector, abstract_vector, features)
        paper = self.prepare_paper(filename)

        # ========> Code from here on is summariser specific <========

        num_sents = len(paper)

        sentences_and_summary_probs = []

        # Store the sentences and probabilities in a list to be sorted
        for i in range(num_sents):
            sentence = paper[i][0]
            sentence_vec = paper[i][1]
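            # Feature index 3 is taken as the keyphrase score (assumed from
            # the class name) and used directly as the ranking probability.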
            prob = paper[i][3][3]
            sentences_and_summary_probs.append(
                (sentence, sentence_vec, prob, i))

        # Sort by descending probability of being a good summary sentence
        sentences_and_summary_probs = sorted(
            sentences_and_summary_probs, key=itemgetter(2), reverse=True)

        summary = sentences_and_summary_probs[0:self.summary_length]

        # Order summary sentences according to the order they appear in the paper
        ordered_summary = sorted(summary, key=itemgetter(-1))

        # Convert summary sentences back to strings, keeping their positions
        summary = []

        for sentence, sentence_vec, prob, pos in ordered_summary:
            sentence = " ".join(sentence)
            summary.append((sentence, pos))

        # Remove the extension explicitly; str.strip(".txt") strips characters,
        # not the suffix, and can mangle filenames.
        paper_name = filename[:-len(".txt")] if filename.endswith(".txt") else filename
        useful_functions.write_summary(SUMMARY_WRITE_LOC, summary, paper_name)

        # Print the summary sentences
        for sentence, _ in summary:
            print(sentence)
            print()

    def load_model(self):
        """
        Loads the classification model
        :return: the classification model
        """
        pass

    def prepare_paper(self, filename):
        """
        Prepares the paper for summarisation.
        :return: The paper in a form suitable for summarisation
        """
        paper = self.preprocessor.prepare_for_summarisation(filename)
        return paper
class FeaturesNoAbsRougeSummariser(Summariser):
    """
    Implements a logistic regression summariser that used a logistic regression classifier to tell if sentences are
    summary sentences or not.
    """
    def __init__(self):
        """
        ROUGE based summariser. This compares each sentence in the paper to the abstract to see which ones make the best
        summaries for the abstract. It is assumed that these sentences will then also be good highlights for the paper.
        """
        self.summary_length = 10
        self.r = Rouge()
        self.preprocessor = AbstractNetPreprocessor()
        self.computation_graph = graph()
        self.features_input = self.computation_graph["features_input"]
        self.prediction_probs = self.computation_graph["prediction_probs"]
        self.similarity_threshold = 0.75

    def summarise(self, filename):
        """
        Generates a summary of the paper.
        :param filename: the name of the file to summaries
        :return: a sumamry of the paper.
        """

        # Each item has form (sentence, sentence_vector, abstract_vector, features)
        paper = self.prepare_paper(filename)

        # ========> Code from here on is summariser specific <========

        with tf.Session() as sess:

            # Initialise all variables
            sess.run(tf.global_variables_initializer())

            # Saving object
            saver = tf.train.Saver()

            # Restore the saved model
            saver.restore(sess, SAVE_PATH)

            # Stores sentences, the probability of them being good summaries and their position in the paper
            sentences_and_summary_probs = []

            # Create a matrix of all of the features in the paper so that we can predict summary probabilities for
            # the whole paper at once
            num_sents = len(paper)
            feed_feats = np.zeros((num_sents, NUM_FEATURES), dtype=np.float32)
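            # item[3][1:] drops the first feature, presumably the AbstractROUGE
            # score given this class's name.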
            for i, item in enumerate(paper):
                feed_feats[i, :] = item[3][1:]

            # Predict how good a summary each sentence is using the computation graph
            probs = sess.run(self.prediction_probs,
                             feed_dict={self.features_input: feed_feats})

            # Store the sentences and probabilities in a list to be sorted
            for i in range(num_sents):
                sentence = paper[i][0]
                sentence_vec = paper[i][1]
                prob = probs[i][1]  # probability of the positive (summary) class
                sentences_and_summary_probs.append(
                    (sentence, sentence_vec, prob, i))

            # Sort by descending probability of being a good summary sentence
            sentences_and_summary_probs = sorted(
                sentences_and_summary_probs, key=itemgetter(2), reverse=True)

            summary = sentences_and_summary_probs[0:self.summary_length]

            # Order summary sentences according to the order they appear in the paper
            ordered_summary = sorted(summary, key=itemgetter(-1))

            # Convert summary sentences back to strings, keeping their positions
            summary = []

            for sentence, sentence_vec, prob, pos in ordered_summary:
                sentence = " ".join(sentence)
                summary.append((sentence, pos))

        # Remove the extension explicitly; str.strip(".txt") strips characters,
        # not the suffix, and can mangle filenames.
        paper_name = filename[:-len(".txt")] if filename.endswith(".txt") else filename
        useful_functions.write_summary(SUMMARY_WRITE_LOC, summary, paper_name)

        # Print the summary sentences
        for sentence, _ in summary:
            print(sentence)
            print()

    def load_model(self):
        """
        Loads the classification model
        :return: the classification model
        """
        pass

    def prepare_paper(self, filename):
        """
        Prepares the paper for summarisation.
        :return: The paper in a form suitable for summarisation
        """
        paper = self.preprocessor.prepare_for_summarisation(filename)
        return paper
class EnsembleV2Summariser(Summariser):
    """
    Implements a logistic regression summariser that used a logistic regression classifier to tell if sentences are
    summary sentences or not.
    """
    def __init__(self):
        """
        ROUGE based summariser. This compares each sentence in the paper to the abstract to see which ones make the best
        summaries for the abstract. It is assumed that these sentences will then also be good highlights for the paper.
        """
        self.summary_length = 10
        self.min_sent_len = 10
        self.r = Rouge()
        self.preprocessor = AbstractNetPreprocessor()

        # Hyperparameter to tune weight given to feature probability
        self.C = 0.30

    def summarise(self, filename):
        """
        Generates a summary of the paper.
        :param filename: the name of the file to summaries
        :return: a sumamry of the paper.
        """

        # Each item has form (sentence, sentence_vector, abstract_vector, features)
        paper = self.prepare_paper(filename)

        # ========> Code from here on is summariser specific <========

        # Stores sentences, the probability of them being good summaries and their position in the paper
        sentences_and_summary_probs = []

        # Summary according to features
        sentences_feat_summary_probs = []

        tf.reset_default_graph()
        computation_graph = lstm_classifier.graph()
        sentence_input = computation_graph["inputs"]
        seq_lens = computation_graph["sequence_lengths"]
        prediction_probs = computation_graph["prediction_probs"]
        keep_prob = computation_graph["keep_prob"]

        with tf.Session() as sess:

            # Initialise all variables
            sess.run(tf.global_variables_initializer())

            # Saving object
            saver = tf.train.Saver()

            # Restore the saved model
            saver.restore(sess, lstm_classifier.SAVE_PATH)

            # Number of sentences in the paper
            num_sents = len(paper)

            # ----> Create the matrix for sentences for the LSTM <----
            sentence_list = []

            for sent, _, _, _ in paper:
                # Truncate overly long sentences to the model's maximum length
                sentence_list.append(sent[:MAX_SENT_LEN])

            # Get the matrix representation of the sentences
            sentence_matrix, sent_lens = sents2input(sentence_list, num_sents)

            # Create the feed_dict
            feed_dict = {
                sentence_input: sentence_matrix,
                seq_lens: sent_lens,
                keep_prob: 1.0  # no dropout at inference time
            }

            # Predict how good a summary each sentence is using the computation graph
            probs = sess.run(prediction_probs, feed_dict=feed_dict)

            # Store the sentences and probabilities in a list to be sorted
            for i in range(num_sents):
                sentence = paper[i][0]
                sentence_vec = paper[i][1]
                prob = probs[i][1]  # probability of the positive (summary) class
                sentences_and_summary_probs.append(
                    (sentence, sentence_vec, prob, i))

        # Reset the default graph before building the second (features) model,
        # so the two checkpoints restore into separate variable sets
        tf.reset_default_graph()
        features_graph = features_mlp.graph()
        features_classifier_input = features_graph["features_input"]
        features_prediction_probs = features_graph["prediction_probs"]
        with tf.Session() as sess:

            # Initialise all variables
            sess.run(tf.global_variables_initializer())

            # Saving object
            saver = tf.train.Saver()

            # ====> Run the second graph <====
            saver.restore(sess, features_mlp.SAVE_PATH)

            # ----> Create the matrix of features for the features MLP <----
            feature_matrix = np.zeros((num_sents, NUM_FEATURES),
                                      dtype=np.float32)

            for i, (_, _, _, feat) in enumerate(paper):
                feature_matrix[i, :] = feat

            # Predict how good a summary each sentence is using the computation graph
            probs = sess.run(
                features_prediction_probs,
                feed_dict={features_classifier_input: feature_matrix})

            # Store the sentences and probabilities in a list to be sorted
            for i in range(num_sents):
                sentence = paper[i][0]
                sentence_vec = paper[i][1]
                prob = probs[i][1]  # probability of the positive (summary) class
                sentences_feat_summary_probs.append(
                    (sentence, sentence_vec, prob, i))

        # ====> Combine the results <====

        summary = []
        sents_already_added = set()

        # ====> Attempt Four <====
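        # Weighted combination of the two classifiers: item[0] comes from the
        # features model, item[1] from the LSTM model; C shifts weight towards
        # the features model:
        #     score = (p_lstm * (1 - C) + p_feat * (1 + C)) / 2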
        final_sents_probs = []

        for item in zip(sentences_feat_summary_probs,
                        sentences_and_summary_probs):
            prob_summNet = item[1][2] * (1 - self.C)
            prob_Features = item[0][2] * (1 + self.C)
            avg_prob = (prob_summNet + prob_Features) / 2
            final_sents_probs.append(
                (item[0][0], item[0][1], avg_prob, item[0][3]))

        # Sort by descending combined probability
        final_sents_probs = sorted(
            final_sents_probs, key=itemgetter(2), reverse=True)

        summary = final_sents_probs[0:self.summary_length]
        """
        # ====> Attempt Three <====
        # Take summary sentences from features
        summary = sentences_feat_summary_probs[0:self.summary_length]
        for item in summary:
            sents_already_added.add(item[3])

        # Add ones from summary net if it's sure of them and they aren't there already
        max_additional = 5
        count_additional = 0
        for item in sentences_and_summary_probs:
            if count_additional > max_additional:
                break
            if item[3] not in sents_already_added and item[2] > 0.95:
                summary.append(item)
                sents_already_added.add(item[3])
                count_additional += 1
        """
        """
        # ====> Attempt Two <====
        i = 0
        while len(summary) < self.summary_length:

            if i >= len(sentences_feat_summary_probs) and i >= len(sentences_and_summary_probs):
                break

            feats = sentences_feat_summary_probs[i]
            summNet = sentences_and_summary_probs[i]

            feats_prob = feats[2]
            summNet_prob = summNet[2]

            if feats_prob >= summNet_prob and feats[3] not in sents_already_added:
                summary.append(feats)
                sents_already_added.add(feats[3])
            elif summNet_prob > feats_prob and summNet[3] not in sents_already_added:
                summary.append(summNet)
                sents_already_added.add(summNet[3])

            i += 1
        """
        """
        # ====> Attempt One <====
        # True to select a summary sentence from summ_net, false to select from features
        summ_net = True
        for i in range(num_sents):

            if len(summary) >= self.summary_length \
                    or len(sentences_and_summary_probs) <= 0 \
                    or len(sentences_feat_summary_probs) <= 0:
                break

            added = False

            if summ_net:

                while not added:

                    if len(sentences_and_summary_probs) <= 0:
                        break

                    highest_prob = sentences_and_summary_probs.pop(0)
                    if highest_prob[3] in sents_already_added or len(highest_prob[0]) < self.min_sent_len:
                        continue
                    else:
                        summary.append(highest_prob)
                        sents_already_added.add(highest_prob[3])
                        added = True

                summ_net = False

            else:

                while not added:

                    if len(sentences_feat_summary_probs) <= 0:
                        break

                    highest_prob = sentences_feat_summary_probs.pop(0)
                    if highest_prob[3] in sents_already_added or len(highest_prob[0]) < self.min_sent_len:
                        continue
                    else:
                        summary.append(highest_prob)
                        sents_already_added.add(highest_prob[3])
                        added = True

                summ_net = True
        """

        # Order summary sentences according to the order they appear in the paper
        ordered_summary = sorted(summary, key=itemgetter(-1))

        # Convert summary sentences back to strings, keeping their positions
        summary = []

        for sentence, sentence_vec, prob, pos in ordered_summary:
            sentence = " ".join(sentence)
            summary.append((sentence, pos))

        # Remove the extension explicitly; str.strip(".txt") strips characters,
        # not the suffix, and can mangle filenames.
        paper_name = filename[:-len(".txt")] if filename.endswith(".txt") else filename
        useful_functions.write_summary(SUMMARY_WRITE_LOC, summary, paper_name)

        # Print the summary sentences
        for sentence, _ in summary:
            print(sentence)
            print()

    def load_model(self):
        """
        Loads the classification model
        :return: the classification model
        """
        pass

    def prepare_paper(self, filename):
        """
        Prepares the paper for summarisation.
        :return: The paper in a form suitable for summarisation
        """
        paper = self.preprocessor.prepare_for_summarisation(filename)
        return paper
class SummariserNetSummariser(Summariser):
    """
    SAFNet Architecture
    """
    def __init__(self):
        """
        ROUGE based summariser. This compares each sentence in the paper to the abstract to see which ones make the best
        summaries for the abstract. It is assumed that these sentences will then also be good highlights for the paper.
        """
        self.summary_length = 10
        self.r = Rouge()
        self.preprocessor = AbstractNetPreprocessor()
        self.computation_graph = graph()
        self.sentence_input = self.computation_graph["sentence_input"]
        self.abstract_input = self.computation_graph["abstract_input"]
        self.features_input = self.computation_graph["features_input"]
        self.seq_lens = self.computation_graph["sequence_lengths"]
        self.prediction_probs = self.computation_graph["raw_predictions"]
        self.keep_prob = self.computation_graph["keep_prob"]
        self.similarity_threshold = 0.75

    def summarise(self, filename):
        """
        Generates a summary of the paper.
        :param filename: the name of the file to summaries
        :return: a sumamry of the paper.
        """

        # Each item has form (sentence, sentence_vector, abstract_vector, features)
        paper = self.prepare_paper(filename)
        # ========> Code from here on is summariser specific <========
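        # This implementation restores a pre-trained SummaRuNNer graph from its
        # .meta file and looks tensors up by name, rather than rebuilding the
        # graph in code.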
        graph1 = tf.get_default_graph()
        with tf.Session() as sess:

            # Initialise all variables
            sess.run(tf.global_variables_initializer())

            # Import the saved graph definition and restore the latest
            # checkpoint weights
            saver = tf.train.import_meta_graph(SAVE_PATH + 'model-200.meta')
            module_file = tf.train.latest_checkpoint(SAVE_PATH)
            saver.restore(sess, module_file)

            # Stores sentences, the probability of them being good summaries and their position in the paper
            sentences_and_summary_probs = []

            # Number of sentences in the paper
            num_sents = len(paper)

            # ----> Create the matrix for sentences for the LSTM <----
            sentence_list = []

            for sent, _, _, _ in paper:
                # Truncate overly long sentences to the model's maximum length
                sentence_list.append(sent[:MAX_SENT_LEN])

            # Get the matrix representation of the sentences
            sentence_matrix, sent_lens = sents2input(sentence_list, num_sents)

            # ----> Create the matrix for abstracts for the LSTM <----
            abstract_matrix = np.zeros((num_sents, ABSTRACT_DIMENSION),
                                       dtype=np.float32)

            for i, (_, _, abs_vec, _) in enumerate(paper):
                abstract_matrix[i, :] = abs_vec

            # ----> Create the matrix of features for the LSTM <----
            feature_matrix = np.zeros((num_sents, NUM_FEATURES),
                                      dtype=np.float32)

            for i, (_, _, _, feat) in enumerate(paper):
                feature_matrix[i, :] = feat

            # Write the paper's sentences to a file in the format SummaRuNNer
            # expects
            with open(SUMM_SOURCE + filename, 'w') as out_file:
                for sent, _, _, _ in paper:
                    out_file.write(" ".join(sent))
                    out_file.write("\n")

            # Read the sentences back as SummaRuNNer input tensors
            feed_x = summarunner_datareader.get_input_tensor(SUMM_SOURCE +
                                                             filename)

            # Look up the input and prediction tensors by name in the imported
            # graph
            input_x = graph1.get_operation_by_name("inputs/x_input").outputs[0]
            self.prediction_probs = graph1.get_operation_by_name(
                "score_layer/prediction").outputs[0]

            # Predict how good a summary each sentence is, one input block at a
            # time, accumulating scores in input order so that probs[i] lines
            # up with paper[i]. The reshape assumes the model's fixed input
            # size of 40 x 100.
            probs = np.array([], dtype=np.float32)
            for x in feed_x:
                probs = np.append(
                    probs,
                    sess.run(self.prediction_probs,
                             feed_dict={input_x: x.reshape(40, 100)}))

            # Store the sentences and probabilities in a list to be sorted
            for i in range(num_sents):
                sentence = paper[i][0]
                sentence_vec = paper[i][1]
                prob = probs[i]
                sentences_and_summary_probs.append(
                    (sentence, sentence_vec, prob, i))

            # Sort by descending probability of being a good summary sentence
            sentences_and_summary_probs = sorted(
                sentences_and_summary_probs, key=itemgetter(2), reverse=True)

            # Take the top-scoring sentences, skipping very short ones
            summary = []
            for sent, sent_vec, prob, pos in sentences_and_summary_probs:
                if len(summary) >= self.summary_length:
                    break

                # Skip very short sentences (fewer than 10 words)
                if len(sent) < 10:
                    continue

                summary.append((sent, sent_vec, prob, pos))

            # Order summary sentences according to the order they appear in the paper
            ordered_summary = sorted(summary, key=itemgetter(-1))

            # Convert summary sentences back to strings, keeping their positions
            summary = []

            for sentence, sentence_vec, prob, pos in ordered_summary:
                sentence = " ".join(sentence)
                summary.append((sentence, pos))
        #print("calling write_summary..")
        useful_functions.write_summary(SUMMARY_WRITE_LOC, summary,
                                       filename.strip(".txt"))


    def load_model(self):
        """
        Loads the classification model
        :return: the classification model
        """
        pass

    def prepare_paper(self, filename):
        """
        Prepares the paper for summarisation.
        :return: The paper in a form suitable for summarisation
        """
        paper = self.preprocessor.prepare_for_summarisation(filename)
        return paper
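

# A minimal driver sketch, illustrative only: the input directory name is
# hypothetical, and each summariser assumes its trained checkpoint exists.
if __name__ == "__main__":
    import os

    summariser = EnsembleV2Summariser()
    for fname in os.listdir("papers"):  # hypothetical input directory
        if fname.endswith(".txt"):
            summariser.summarise(fname)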