def heuristic_one(self):
        """ Apply a position-based weight to every sentence of every segment.

            Sentences near the start of a segment receive more weight than
            later ones, on the premise that articles tend to put the most
            informative sentences first.

            Within each segment the first sentence receives
            HEURONE_WEIGHT / 2 and every following sentence receives half
            of the previous sentence's value.  With a float weight the
            per-segment total therefore approaches, but never reaches,
            HEURONE_WEIGHT.
            NOTE(review): earlier documentation quoted that cap as 0.10 and
            a typical total of "~0.75"; the latter is presumably a typo for
            ~0.075 -- confirm against the value of HEURONE_WEIGHT.

            Each value is handed to self.scores.update_score(index, value),
            where index is a counter that starts at 0 and keeps increasing
            across segments.  Nothing is returned.
        """
        index = 0  # running sentence index, not reset between segments
        for seg in self.segments:
            # Every segment starts again from the full weight.
            weight = HEURONE_WEIGHT
            for _ in split_sentences(seg):
                weight = weight / 2
                self.scores.update_score(index, weight)
                index += 1
def create_segment_boundaries(segments):
    """ Map each segment to the range of sentence indices it covers.

        Sentences are numbered consecutively from 0 across all segments,
        in the order the segments are given.

        Returns a list with one (lower, upper) tuple per segment, where
        lower is inclusive and upper is exclusive: [lower, upper).  A
        sentence with index i belongs to the segment whose range
        contains i.  A segment without sentences yields an empty range
        (lower == upper); an empty `segments` yields an empty list.
    """
    boundaries = []
    lower = 0
    for seg in segments:
        # The result of split_sentences() supports len() (get_number_sents
        # relies on the same thing), so no counting loop is needed.
        upper = lower + len(split_sentences(seg))
        boundaries.append((lower, upper))
        # The next segment starts where this one stopped.
        lower = upper
    return boundaries
# Example #3
0
def get_number_sents(length, fil):
    """ Work out how many sentences the summary should contain.

        `length` is the requested summary size expressed as a percentage
        of the sentences that split_sentences() finds in `fil`.  The
        product is passed through round() and returned as an int.
    """
    total_sentences = len(split_sentences(fil))
    fraction = float(length) / 100
    return int(round(fraction * total_sentences))
 def segment_heur_two(self):
     """ Apply -SEG_HEURTWO_WEIGHT to segments with fewer than two sentences.

         For every segment in self.segments whose sentence count (as
         reported by split_sentences) is zero or one, the negated
         SEG_HEURTWO_WEIGHT is passed to
         self.seg_scores.update_seg_score together with the segment's
         position in self.segments.  Other segments are not touched.

         NOTE: the original author considered raising the threshold so
         that two-sentence segments are treated the same way.
     """
     for seg_index, seg in enumerate(self.segments):
         if len(split_sentences(seg)) < 2:
             self.seg_scores.update_seg_score(seg_index, -SEG_HEURTWO_WEIGHT)
def run(numsents, fil, sentscores, segscores):
    """ Select the sentences for the final summary and print them.

        The document `fil` is split both into segments and into
        sentences.  Sentences are then chosen across the segments, so
        that the summary draws on several parts of the original
        document, and the result is handed to print_summary.

        Nothing is returned; the output goes through print_summary.
    """
    topic_segments = split_segments(fil)
    all_sentences = split_sentences(fil)
    ranges = create_segment_boundaries(topic_segments)

    # Only the segment scores are ordered here; a sentence-level
    # get_maxscore_ordering(sentscores, ...) pass used to exist but is
    # disabled.
    seg_ordering = get_maxscore_ordering(segscores, topic_segments)

    chosen = select_sentences(
        ranges, numsents, seg_ordering, sentscores, topic_segments
    )
    print_summary(chosen, all_sentences)
 def __init__(self, fil):
     """ Store the input and derive the sentence and segment state from it.

         `fil` is kept as-is on the instance and passed to
         split_sentences and split_segments (presumably the document
         text -- confirm against the caller).  A Score object sized by
         the sentence count is created alongside.
     """
     self.fil = fil
     self.sentences = sentences = split_sentences(fil)
     self.num_sentences = count = sentence_count(sentences)
     self.scores = Score(count)
     self.segments = split_segments(fil)