    def extract_ngrams2(self, concept_type='ngrams', n=2):
        """Extract the ngrams of words from the input sentences.

        Args:
            concept_type (str): 'ngrams' to use pruned word ngrams as concepts,
                'phrase' to use the sentences' precomputed phrases.
            n (int): the number of words per ngram, defaults to 2
        """
        for i, sentence in enumerate(self.sentences):
            untokenized_concepts = []
            if concept_type == 'ngrams':
                ngrams = extract_ngrams2([sentence.untokenized_form],
                                         self.stemmer, self.LANGUAGE, n)
                pruned_list = prune_ngrams(ngrams, self.stoplist, n)
            elif concept_type == 'phrase':
                pruned_list = self.sentences[i].phrases

            for concept in pruned_list:
                wrds = unstem_ngram(concept, sentence)
                untokenized_concepts.append(" ".join(wrds))

            self.sentences[i].concepts = pruned_list
            self.sentences[i].untokenized_concepts = untokenized_concepts
            if len(self.sentences[i].concepts) != len(
                    self.sentences[i].untokenized_concepts):
                raise ValueError(
                    "unexpected length difference between concepts and untokenized_concepts"
                )
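
# A minimal, self-contained sketch of what the stemmed bigram "concepts" and their
# untokenized counterparts above look like for a single sentence. It re-implements
# the idea of extract_ngrams2/prune_ngrams in plain Python for illustration only;
# the stoplist and the pruning rule here are assumptions, not the project's helpers.
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')
stoplist = {'the', 'of', 'a', 'an', 'in'}

sentence = "The parliament approved the new climate bill"
tokens = sentence.lower().split()
stems = [stemmer.stem(t) for t in tokens]

# keep only bigrams in which at least one token is not a stopword
keep = [i for i in range(len(tokens) - 1)
        if not all(t in stoplist for t in tokens[i:i + 2])]
concepts = [' '.join(stems[i:i + 2]) for i in keep]               # stemmed concepts
untokenized_concepts = [' '.join(tokens[i:i + 2]) for i in keep]  # surface forms
print(concepts)
print(untokenized_concepts)
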
    def solve_ilp(self, N):
        # build the A matrix: a_ij is 1 if j-th gram appears in the i-th sentence

        A = np.zeros((len(self.sentences_idx), len(self.ref_ngrams_idx)))
        for i in self.sentences_idx:
            sent = self.sentences[i].untokenized_form
            sngrams = list(extract_ngrams2([sent], self.stemmer, self.LANGUAGE, N))
            for j in self.ref_ngrams_idx:
                if self.ref_ngrams[j] in sngrams:
                    A[i][j] = 1

        # Define ILP variables: x_i is 1 if sentence i is selected, z_j is 1 if gram j appears in the created summary
        x = pulp.LpVariable.dicts('sentences', self.sentences_idx, lowBound=0, upBound=1, cat=pulp.LpInteger)
        z = pulp.LpVariable.dicts('grams', self.ref_ngrams_idx, lowBound=0, upBound=1, cat=pulp.LpInteger)

        # Define the ILP problem: maximize coverage of grams from the reference summaries
        prob = pulp.LpProblem("ExtractiveUpperBound", pulp.LpMaximize)
        prob += sum(z[j] for j in self.ref_ngrams_idx)

        # Define ILP constraints: the length constraint and the consistency constraint (z_j can be 1 only
        # if gram j appears in the created summary)
        prob += sum(x[i] * self.sentences[i].length for i in self.sentences_idx) <= self.sum_length

        for j in self.ref_ngrams_idx:
            prob += sum(A[i][j] * x[i] for i in self.sentences_idx) >= z[j]

        # Solve ILP problem and post-processing to get the summary
        prob.solve(pulp.GLPK(msg=0))

        summary_idx = []
        for idx in self.sentences_idx:
            if x[idx].value() == 1.0:
                summary_idx.append(idx)

        return summary_idx
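
# A standalone toy version of the same coverage ILP, with made-up data (the A matrix,
# sentence lengths and length budget below are purely illustrative): maximize the number
# of covered reference grams subject to the length budget and the consistency constraints.
import pulp

A = [[1, 0, 1],        # A[i][j] = 1 if reference gram j appears in sentence i
     [0, 1, 1],
     [1, 1, 0]]
lengths = [12, 9, 15]  # sentence lengths in words
budget = 22            # summary length budget in words

sent_idx = range(len(A))
gram_idx = range(len(A[0]))

x = pulp.LpVariable.dicts('sent', sent_idx, lowBound=0, upBound=1, cat=pulp.LpInteger)
z = pulp.LpVariable.dicts('gram', gram_idx, lowBound=0, upBound=1, cat=pulp.LpInteger)

prob = pulp.LpProblem("ToyExtractiveUpperBound", pulp.LpMaximize)
prob += pulp.lpSum(z[j] for j in gram_idx)                           # objective
prob += pulp.lpSum(x[i] * lengths[i] for i in sent_idx) <= budget    # length constraint
for j in gram_idx:
    prob += pulp.lpSum(A[i][j] * x[i] for i in sent_idx) >= z[j]     # consistency constraint

prob.solve()  # default CBC solver; solve_ilp above uses GLPK
print([i for i in sent_idx if x[i].value() == 1.0])
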
    def extract_ngrams2(self, concept_type='ngrams', n=2):
        """Extract the ngrams of words from the input sentences.

        Args:
            concept_type (str): 'ngrams' or 'phrase', selecting the concept source.
            n (int): the number of words per ngram, defaults to 2
        """
        for i, sentence in enumerate(self.sentences):
            if concept_type == 'ngrams':
                ngrams = extract_ngrams2([sentence.untokenized_form], self.stemmer, self.LANGUAGE, n)
                pruned_list = prune_ngrams(ngrams, self.stoplist, n)
            elif concept_type == 'phrase':
                pruned_list = self.sentences[i].phrases
                
            self.sentences[i].concepts = pruned_list
Example #4
    def __init__(self, rouge, models, parse_info, language, stemmer, summary_length=100, N=2, stopwords=None,
                 ub_score=None,
                 ub_summary=None, summarizer=None, parser_type=None):
        self.rouge = rouge
        self.models = models
        self.language = language
        self.stopwords = stopwords or set()
        self.summary_length = summary_length

        self.ref_ngrams = set()  # set of ngrams that are in the reference summaries (for the feedback to peek)
        self.ref_phrases = set()  # set of phrases that are in the reference summaries (for the feedback to peek)
        self.__ub_summary__ = ub_summary or []
        self.__ub_score__ = ub_score or (0.0, 0.0, 0.0)

        # this only deals with the reference summaries
        parse_info = parse_info or []
        for model_name, model in models:
            y = set(extract_ngrams2(model, stemmer, language, N))
            self.ref_ngrams = self.ref_ngrams.union(y)
            if parser_type == PARSE_TYPE_PARSE:
                for _, parse_sents in parse_info[1]:
                    for parse_sent in parse_sents:
                        _, phrases = get_parse_info(parse_sent, stemmer, language,
                                                    stopwords)
                        y = set(prune_phrases(phrases, stopwords, stemmer, language))
                        self.ref_phrases = self.ref_phrases.union(y)

        if summarizer is not None:
            if parser_type is None or parser_type == PARSE_TYPE_NGRAMS:
                concept_match = [key for key in summarizer.weights if key in self.ref_ngrams]
                log.debug('Total uniq ref concepts (ngr):   %s' % (len(self.ref_ngrams)))
            elif parser_type == PARSE_TYPE_PARSE:
                concept_match = [key for key in summarizer.weights if key in self.ref_phrases]
                log.debug('Total uniq ref concepts (phr):   %s' % (len(self.ref_phrases)))
            else:
                raise ValueError("parse_type '%s' is invalid, should be %s or %s" %
                                 (parser_type, None, PARSE_TYPE_PARSE))
            log.debug('UB Accept concepts:  %s' % (len(concept_match)))
    def get_ref_ngrams(self, N):
        for _, summary in self.models:
            self.ref_ngrams.extend(extract_ngrams2(summary, self.stemmer, self.LANGUAGE, N))
    def __call__(self,
                 docs,
                 models,
                 summary_length,
                 oracle_type,
                 ub_score,
                 ub_summary,
                 parser_type=None,
                 parse_info=[],
                 max_iteration_count=11,
                 weights_override={},
                 clear_before_override=None,
                 propagation=False):
        """
        This starts of the simualted feedback for a single cluster of documents, towards a list of models. i.e. the
        models get united, and then the feedback loop is simulated.

        :param docs:
        :param models:
        :param summary_length:
        :param oracle_type:
        :param ub_score:
        :param ub_summary:
        :param parser_type:
        :param parse_info:
        :param max_iteration_count: int: Maximum number of iterations to run.
        :param weights_override: dict: (concept -> double) dictionary containing the override weights for propagation
        """

        self.models = models
        self.summary_length = summary_length
        self.ub_score = ub_score
        self.parse_type = parser_type
        self.cluster_size = len(docs)
        self.MAX_WEIGHT = len(docs)

        for model_name, model in models:
            y = set(extract_ngrams2(model, self.stemmer, self.language,
                                    self.N))
            self.ref_ngrams = self.ref_ngrams.union(y)
            if parser_type == PARSE_TYPE_PARSE:
                for _, parse_sents in parse_info[1]:
                    for parse_sent in parse_sents:
                        _, phrases = get_parse_info(parse_sent, self.stemmer,
                                                    self.language,
                                                    self.stoplist)
                        y = set(
                            prune_phrases(phrases, self.stoplist, self.stemmer,
                                          self.language))
                        self.ref_phrases = self.ref_phrases.union(y)

        self.summarizer.sentences = self.SumeWrap.load_sume_sentences(
            docs, parser_type, parse_info)
        parse_info = []

        # extract bigrams or phrases as concepts
        if self.parse_type == PARSE_TYPE_PARSE:
            print('Get concept types Phrases')
            self.summarizer.extract_ngrams2(concept_type='phrase')
        elif self.parse_type is None:
            print('Get concept types ngrams')
            self.summarizer.extract_ngrams2(concept_type='ngrams')

        # compute document frequency as concept weights
        self.summarizer.compute_document_frequency()

        # compute word_frequency
        self.summarizer.compute_word_frequency()

        old_sentences = self.summarizer.sentences

        self.summarizer.prune_sentences(remove_citations=True,
                                        remove_redundancy=True,
                                        imp_list=[])

        # drop weights of concepts that only occurred in pruned sentences, i.e. keep a concept's weight
        # only if it still appears in one of the retained sentences
        retained_concepts = {
            concept for s in self.summarizer.sentences
            for concept in s.concepts
        }

        print('Total concepts before sentence pruning: ',
              len(self.summarizer.weights))

        for sentence in set(old_sentences).difference(
                self.summarizer.sentences):
            for concept in sentence.concepts:
                if concept not in retained_concepts and concept in self.summarizer.weights:
                    del self.summarizer.weights[concept]

        print('Total concepts found: ', len(self.summarizer.weights))

        if self.parse_type is None:
            concept_match = [
                key for key in self.summarizer.weights
                if key in self.ref_ngrams
            ]
            print('Total ref concepts:   ', len(self.ref_ngrams))
        elif self.parse_type == PARSE_TYPE_PARSE:
            concept_match = [
                key for key in self.summarizer.weights
                if key in self.ref_phrases
            ]
            print('Total ref concepts:   ', len(self.ref_phrases))
        print('UB Accept concepts:   ', len(concept_match))

        if oracle_type.startswith(ORACLE_TYPE_ACTIVE_LEARNING):
            self.get_feature_vector()
            self.data = np.array(self.fvector)
            model = svm.SVC(kernel='linear',
                            C=1.0,
                            probability=True,
                            class_weight='balanced')

        self.initial_weights = self.summarizer.weights

        self.__apply_initial_weights_override__(weights_override,
                                                clear_before_override)
        '''
        # create the coocurence graph
        self.graph.clear()
        self.graph.add_sentences(self.summarizer.sentences)
        dump_dir=tempfile.mkdtemp(dir=self.debug_dump_target_dir)
        '''

        print('Summarizing %s sentences down to %s words' %
              (len(self.summarizer.sentences), self.summary_length))
        # core algorithm for feedback calculation... (as in paper)
        flag = 0
        # get_details computes the personalized summary from the current concept weights in every iteration,
        # starting from the document-frequency (Boudin) weights unless weights_override is given.

        # initial iteration
        summary, self.score, subset = self.get_details(1, summary_length,
                                                       oracle_type)
        self.prev_score = (0.0, 0.0, 0.0)
        prev_summary = ''
        for iteration in range(2, max_iteration_count):
            self.dump_current_weight_map(self.debug_dump_target_dir,
                                         max_iteration_count)
            # here, depending on the oracle_type, an intermediate summary is generated. This intermediate summary
            # satisfies other optimization criteria, so that the amount/probability of getting useful feedback is maximized
            if iteration > 2:
                subset = self.__generate_optimal_feedback_summary__(
                    flag, oracle_type, summary_length)

            print('Summary Subset:', subset)

            # acquire feedback and record it using the flight_recorder
            #new_accepts, new_rejects, new_implicits = self.get_feedback(subset, RECOMMENDER_METHOD_HIGHEST_WEIGHT)
            new_accepts, new_rejects, new_implicits = self.get_feedback(subset)
            self.flight_recorder.record(new_accepts, new_rejects,
                                        new_implicits)

            # update the summarizer weights for next iteration
            self.recalculate_weights(oracle_type, propagation)

            summary, self.score, _ = self.get_details(iteration,
                                                      summary_length,
                                                      oracle_type)

            if oracle_type.startswith(ORACLE_TYPE_ACTIVE_LEARNING):
                self.uncertainity, self.labels = self.get_uncertainity_labels(
                    model)

            if self.check_break_condition(iteration, prev_summary, summary,
                                          ub_summary, self.prev_score):
                break

            self.prev_score = self.score
            prev_summary = summary

        return summary
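
# The weight update in recalculate_weights() is not shown in this snippet. A minimal
# sketch of one plausible accept/reject update is given below, assuming accepted concepts
# are boosted to MAX_WEIGHT (the cluster size) and rejected concepts are suppressed to
# zero; the function name and the toy weights are illustrative, not the project's code.
def apply_feedback(weights, accepts, rejects, max_weight):
    """Return a copy of the concept weights with user feedback applied."""
    updated = dict(weights)
    for concept in accepts:
        if concept in updated:
            updated[concept] = max_weight  # concept confirmed by the user
    for concept in rejects:
        if concept in updated:
            updated[concept] = 0.0         # concept should never be selected again
    return updated

# Example for a 3-document cluster (MAX_WEIGHT == 3):
weights = {'climat chang': 2.0, 'last year': 1.0, 'unit nation': 3.0}
print(apply_feedback(weights, {'climat chang'}, {'last year'}, max_weight=3))
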
Example #7
def plot_ngrams():
    data_path = "%s/data" % (path.dirname(path.dirname(
        path.abspath(__file__))))
    summary_len, data_set, language = get_args()
    stemmer = SnowballStemmer(language)

    reader = CorpusReader(data_path)
    data = reader.get_data(data_set, summary_len)

    for topic, docs, models in data:

        print(topic)
        summarizer = ExtractiveUpperbound(language)
        ub_summary = summarizer(docs, models, summary_len, ngram_type=2)

        summarizer = SumeWrap(language)
        summarizer.s.sentences = summarizer.load_sume_sentences(docs)
        summarizer.s.extract_ngrams2()
        summarizer.s.compute_document_frequency()

        sorted_list = get_sorted(summarizer.s.weights)

        ngrams_ub = extract_ngrams2(ub_summary, stemmer, language)
        ngrams_models = []
        for _, model in models:
            ngrams_models.append(extract_ngrams2(model, stemmer, language))

        inter_ngrams = []
        for i in range(len(ngrams_models)):
            for j in range(i + 1, len(ngrams_models)):
                inter_ngrams.extend(
                    set(ngrams_models[i]).intersection(set(ngrams_models[j])))

        final_ngrams_models = []
        for i in range(len(ngrams_models)):
            final_ngrams_models.append(
                list(set(ngrams_models[i]) - set(inter_ngrams)))

        all_ngrams_unique = []
        for ngrams in ngrams_models:
            all_ngrams_unique = list(set(all_ngrams_unique).union(set(ngrams)))

        all_ngrams = []
        for ngrams in ngrams_models:
            all_ngrams.extend(ngrams)
        all_ngrams = list(set(all_ngrams))

        final_ngrams_models.append(all_ngrams)

        x = [0]
        y = [[0] for _ in range(len(final_ngrams_models))]

        for i in range(50, len(sorted_list), 10):
            docs_ngrams = sorted_list[:i]
            x.append(i)
            prev_y = 0
            for index in range(len(final_ngrams_models)):
                val = len(ngrams_match(docs_ngrams,
                                       final_ngrams_models[index]))
                if index == len(final_ngrams_models) - 1:
                    y[index].append(val)
                else:
                    y[index].append(prev_y + val)
                prev_y += val
        plt.fill_between(x, [0] * len(y[0]),
                         y[0],
                         facecolor='green',
                         interpolate=True)
        plt.plot(x, y[0], 'g', label='Unique bigrams by User 1')
        plt.fill_between(x, y[0], y[1], facecolor='blue', interpolate=True)
        plt.plot(x, y[1], 'r', label='Unique bigrams by User 2')
        plt.fill_between(x, y[1], y[2], facecolor='red', interpolate=True)
        plt.plot(x, y[2], 'b', label='Unique bigrams by User 3')
        plt.fill_between(x, y[2], y[3], facecolor='yellow', interpolate=True)
        plt.plot(x, y[3], 'y', label='Unique bigrams by User 4')
        plt.fill_between(x, y[3], y[4], facecolor='black', interpolate=True)
        plt.plot(x,
                 y[4],
                 'k',
                 label='Overlapping bigrams between at least two Users')

        #plt.plot(x, y[5], 'k', label='Upper Bound')

        plt.legend(loc="upper left", fontsize=10)
        plt.xlabel("No. of sorted bigrams in the source documents",
                   fontsize=20)
        plt.ylabel("Overlapping w.r.t. reference summaries", fontsize=20)
        plt.yscale("linear", linewidth=1)
        plt.grid(True)
        plt.show()