def heuristic_one(self):
    """
    Position heuristic: favour sentences near the start of a segment.

    Walks self.segments in order and splits each one with split_sentences.
    Within a segment the bonus starts at HEURONE_WEIGHT and is halved
    before every sentence, so the first sentence receives
    HEURONE_WEIGHT / 2, the second HEURONE_WEIGHT / 4, and so on.
    Assuming HEURONE_WEIGHT is a float, the bonuses handed out inside one
    segment therefore add up to slightly less than HEURONE_WEIGHT and
    never reach it.

    Each bonus is passed to self.scores.update_score together with the
    sentence's running index, which counts sentences across all segments
    starting from 0.

    Updates the scores array; returns nothing.
    """
    sentence_index = 0  # running position of the sentence in the article
    for segment in self.segments:
        segment_sentences = split_sentences(segment)
        bonus = HEURONE_WEIGHT  # reset at the start of every segment
        for _ in segment_sentences:
            # Halve first, then award: the leading sentence gets half of
            # the starting weight, not the full weight.
            bonus = bonus / 2
            self.scores.update_score(sentence_index, bonus)
            sentence_index += 1
def create_segment_boundaries(segments):
    """
    Determines which sentence-index range corresponds to each segment of
    the article.

    Sentences are numbered consecutively from 0 in segment order; the
    number of sentences in a segment is the number of items produced by
    split_sentences for that segment.

    Args:
        segments: iterable of segments, each accepted by split_sentences.

    Returns:
        A list with one (lower, upper) tuple per segment, in the same
        order as `segments`. The lower bound is inclusive and the upper
        bound is exclusive: [lower, upper). A sentence with index i
        belongs to the segment whose range contains i. A segment with no
        sentences yields a pair with lower == upper.
    """
    boundaries = []
    lower = 0
    for seg in segments:
        # Count by iterating rather than len() so this works whether
        # split_sentences returns a list or a lazy iterable.
        upper = lower + sum(1 for _ in split_sentences(seg))
        boundaries.append((lower, upper))
        # The next segment starts where this one ended.
        lower = upper
    return boundaries
def get_number_sents(length, fil):
    """
    Work out how many sentences the summary should contain.

    `length` is the summary size requested by the user as a percentage
    of the original text; `fil` is the text itself, which is split with
    split_sentences to obtain the total sentence count.

    Returns the percentage of the total sentence count, rounded to the
    nearest whole number, as an int.
    """
    total_sentences = len(split_sentences(fil))
    fraction = float(length) / 100
    return int(round(fraction * total_sentences))
def segment_heur_two(self):
    """
    Segment heuristic: single out very short segments.

    For every segment in self.segments, counts the sentences produced by
    split_sentences. When a segment has fewer than two sentences (one,
    or none at all), self.seg_scores.update_seg_score is called with the
    segment's index and the negated SEG_HEURTWO_WEIGHT.

    (NOTE:: the threshold may be raised to 2 sentences in the future.)
    """
    for seg_index, segment in enumerate(self.segments):
        sentence_total = sum(1 for _ in split_sentences(segment))
        if sentence_total >= 2:
            # Long enough; leave this segment's score untouched.
            continue
        self.seg_scores.update_seg_score(seg_index, -SEG_HEURTWO_WEIGHT)
def run(numsents, fil, sentscores, segscores):
    """
    Entry point for selecting the sentences that make it into the final
    summary.

    Sentences are picked from multiple 'segments' of the article so that
    the summary captures all aspects of the original document.

    Steps:
      1. Split `fil` into segments and, separately, into sentences.
      2. Compute the sentence-index boundaries of each segment.
      3. Order the segments with get_maxscore_ordering using `segscores`.
      4. Let select_sentences choose `numsents` sentences from the
         boundaries, the ordered segment scores, `sentscores` and the
         segments.
      5. Hand the selection and the full sentence list to print_summary.

    Returns nothing.
    """
    article_segments = split_segments(fil)
    article_sentences = split_sentences(fil)
    segment_ranges = create_segment_boundaries(article_segments)
    ranked_segments = get_maxscore_ordering(segscores, article_segments)
    chosen = select_sentences(
        segment_ranges,
        numsents,
        ranked_segments,
        sentscores,
        article_segments,
    )
    print_summary(chosen, article_sentences)
def __init__(self, fil):
    """
    Build the per-document state from `fil`.

    Sets the following attributes:
      fil           -- the value passed in, stored unchanged
      sentences     -- result of split_sentences on `fil`
      num_sentences -- result of sentence_count on those sentences
      scores        -- a Score constructed with num_sentences
      segments      -- result of split_segments on `fil`
    """
    self.fil = fil

    # Sentence-level view of the document and its score container.
    self.sentences = split_sentences(fil)
    self.num_sentences = sentence_count(self.sentences)
    self.scores = Score(self.num_sentences)

    # Segment-level view of the same document.
    self.segments = split_segments(fil)