def summarize(self, msgs, range_spec=None):
    """Return a short text summary built from a list of messages.

    TODO: 1. Looks like spacy is not getting the main sentence from the
    message. 2. Load times for the spacy summarizer won't cut it.

    Args:
        msgs: sequence of message mappings. Each one is indexed with
            ``'ts'`` and may carry a ``'user'`` key.
        range_spec: optional mapping. ``'size'`` caps the number of
            messages in the summary (default 3), ``'txt'`` is the
            prefix of the returned text and ``'start'`` is a
            ``"%B %d %Y"`` date string.  The whole mapping is also
            forwarded as keyword arguments to ``tspec_to_delta``.

    Returns:
        The prefix followed by the tagged form of the selected
        messages, ordered by ``'ts'``; or a fixed "unable" notice
        when ``msgs`` is empty.
    """
    size = range_spec['size'] if range_spec and 'size' in range_spec else 3
    if not msgs:
        self.logger.warning("No messages to form summary")
        return u"\n Unable to form summary here.\n"
    txt = range_spec['txt'] if range_spec else u'Summary is'

    if range_spec:
        # Imported locally so the fix does not depend on how the module
        # itself imports datetime.
        from datetime import datetime as _datetime
        self.logger.info("First 10 messages %s of %s", msgs[:10], len(msgs))
        self.logger.info("Using time range spec %s", range_spec)
        if 'start' in range_spec:
            # datetime.strptime (not time.strptime): the result must support
            # "+ delta" and comparison with the values from ts_to_time.
            start_time = _datetime.strptime(range_spec['start'], "%B %d %Y")
        else:
            start_time = ts_to_time(min(msgs, key=lambda m: m['ts'])['ts'])
        self.logger.info("Start time is %s", start_time)
        # NOTE(review): assumes tspec_to_delta returns a timedelta - confirm.
        end_time = start_time + tspec_to_delta(**range_spec)
        self.logger.info("End time is %s", end_time)
        msgs = [
            msg for msg in msgs
            if start_time <= ts_to_time(msg['ts']) <= end_time
        ]
        self.logger.info("First 10 messages %s of %s", msgs[:10], len(msgs))

    summ = txt + u' '
    # Messages with the same canonical text collapse into one entry.
    can_dict = {canonicalize(get_msg_text(msg)): msg for msg in msgs}
    # Longest messages (by word count) first.
    top_keys = sorted(can_dict, key=lambda x: len(x.split()), reverse=True)
    can_dict = {key: can_dict[key] for key in top_keys}
    self.logger.info("Length of can_dict is %s", len(can_dict))
    simple_sum_list = [can_dict[key] for key in top_keys[:size]]

    if len(msgs) < 10:
        # Few messages: just return the longest ones.
        chosen = simple_sum_list
    else:
        max_sents = {}
        user_sents = {}
        for text, msg in can_dict.items():
            if len(text.split()) > 3:
                sentences = list(self.sumr.nlp(text).sents)
                if not sentences:
                    # Nothing to pick from; max() would raise on empty input.
                    continue
                longest = max(sentences, key=len).text
                max_sents[longest] = msg
                user_sents[longest] = msg['user'] if 'user' in msg else u''
        txt_sum = list(
            self.sumr(u' '.join(max_sents.keys()), size, user_sents))
        self.logger.info("Canonical keys are \n%s",
                         u' '.join(can_dict.keys()))
        self.logger.info("Spacy summ %s", txt_sum)
        # Summary sentences that map straight back to a message.
        nlp_list = [
            max_sents[ss] for ss in txt_sum
            if len(ss) > 1 and ss in max_sents
        ]
        # Fuzzy fallback for summary sentences with no exact match.
        for ss in txt_sum:
            if ss not in max_sents and len(ss.split()) > 5:
                self.logger.info("Searching for: %s", ss)
                for ky, msg in max_sents.items():
                    overlaps = ss in ky or (len(ky.split()) > 10
                                            and ky in ss)
                    # The size cap applies to both kinds of overlap.
                    if overlaps and len(nlp_list) < size:
                        nlp_list.append(msg)
        if len(nlp_list) < 2:
            self.logger.info("Failed to find nlp summary using heuristic")
            chosen = simple_sum_list
        else:
            self.logger.info("First msg is %s, %s", nlp_list[0],
                             nlp_list[0]['ts'])
            self.logger.info("Sorted is %s",
                             sorted(nlp_list, key=lambda x: x['ts']))
            chosen = nlp_list

    summ += u'\n'.join(
        [self.tagged_sum(msg) for msg in sorted(chosen, key=lambda x: x['ts'])])
    self.logger.info("Summary for segment %s is %s", msgs, summ)
    return summ
def test_interval_conversion(self):
    """ts_to_time maps an epoch-seconds string to the matching UTC datetime.

    The expected value is built from the whole-second part only, so the
    fractional suffix of the input must not affect the result.
    """
    expected = datetime.utcfromtimestamp(1441925382)
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(ts_to_time("1441925382.000186"), expected)
def test_interval_conversion(self):
    """ts_to_time maps an epoch-seconds string to the matching UTC datetime.

    The expected value is built from the whole-second part only, so the
    fractional suffix of the input must not affect the result.
    """
    expected = datetime.utcfromtimestamp(1441925382)
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(ts_to_time("1441925382.000186"), expected)
def summarize(self, msgs, range_spec=None):
    """Return a short text summary built from a list of messages.

    TODO: 1. Looks like spacy is not getting the main sentence from the
    message. 2. Load times for the spacy summarizer won't cut it.

    Args:
        msgs: sequence of message mappings. Each one is indexed with
            ``'ts'`` and may carry a ``'user'`` key.
        range_spec: optional mapping. ``'size'`` caps the number of
            messages in the summary (default 3), ``'txt'`` is the
            prefix of the returned text and ``'start'`` is a
            ``"%B %d %Y"`` date string.  The whole mapping is also
            forwarded as keyword arguments to ``tspec_to_delta``.

    Returns:
        The prefix followed by the tagged form of the selected
        messages, ordered by ``'ts'``; or a fixed "unable" notice
        when ``msgs`` is empty.
    """
    size = range_spec['size'] if range_spec and 'size' in range_spec else 3
    if not msgs:
        self.logger.warning("No messages to form summary")
        return u"\n Unable to form summary here.\n"
    txt = range_spec['txt'] if range_spec else u'Summary is'

    if range_spec:
        # Imported locally so the fix does not depend on how the module
        # itself imports datetime.
        from datetime import datetime as _datetime
        self.logger.info("First 10 messages %s of %s", msgs[:10], len(msgs))
        self.logger.info("Using time range spec %s", range_spec)
        if 'start' in range_spec:
            # datetime.strptime (not time.strptime): the result must support
            # "+ delta" and comparison with the values from ts_to_time.
            start_time = _datetime.strptime(range_spec['start'], "%B %d %Y")
        else:
            start_time = ts_to_time(min(msgs, key=lambda m: m['ts'])['ts'])
        self.logger.info("Start time is %s", start_time)
        # NOTE(review): assumes tspec_to_delta returns a timedelta - confirm.
        end_time = start_time + tspec_to_delta(**range_spec)
        self.logger.info("End time is %s", end_time)
        msgs = [
            msg for msg in msgs
            if start_time <= ts_to_time(msg['ts']) <= end_time
        ]
        self.logger.info("First 10 messages %s of %s", msgs[:10], len(msgs))

    summ = txt + u' '
    # Messages with the same canonical text collapse into one entry.
    can_dict = {canonicalize(get_msg_text(msg)): msg for msg in msgs}
    # Longest messages (by word count) first.
    top_keys = sorted(can_dict, key=lambda x: len(x.split()), reverse=True)
    can_dict = {key: can_dict[key] for key in top_keys}
    self.logger.info("Length of can_dict is %s", len(can_dict))
    simple_sum_list = [can_dict[key] for key in top_keys[:size]]

    if len(msgs) < 10:
        # Few messages: just return the longest ones.
        chosen = simple_sum_list
    else:
        max_sents = {}
        user_sents = {}
        for text, msg in can_dict.items():
            if len(text.split()) > 3:
                sentences = list(self.sumr.nlp(text).sents)
                if not sentences:
                    # Nothing to pick from; max() would raise on empty input.
                    continue
                longest = max(sentences, key=len).text
                max_sents[longest] = msg
                user_sents[longest] = msg['user'] if 'user' in msg else u''
        txt_sum = list(
            self.sumr(u' '.join(max_sents.keys()), size, user_sents))
        self.logger.info("Canonical keys are \n%s",
                         u' '.join(can_dict.keys()))
        self.logger.info("Spacy summ %s", txt_sum)
        # Summary sentences that map straight back to a message.
        nlp_list = [
            max_sents[ss] for ss in txt_sum
            if len(ss) > 1 and ss in max_sents
        ]
        # Fuzzy fallback for summary sentences with no exact match.
        for ss in txt_sum:
            if ss not in max_sents and len(ss.split()) > 5:
                self.logger.info("Searching for: %s", ss)
                for ky, msg in max_sents.items():
                    overlaps = ss in ky or (len(ky.split()) > 10
                                            and ky in ss)
                    # The size cap applies to both kinds of overlap.
                    if overlaps and len(nlp_list) < size:
                        nlp_list.append(msg)
        if len(nlp_list) < 2:
            self.logger.info("Failed to find nlp summary using heuristic")
            chosen = simple_sum_list
        else:
            self.logger.info("First msg is %s, %s", nlp_list[0],
                             nlp_list[0]['ts'])
            self.logger.info("Sorted is %s",
                             sorted(nlp_list, key=lambda x: x['ts']))
            chosen = nlp_list

    summ += u'\n'.join(
        [self.tagged_sum(msg) for msg in sorted(chosen, key=lambda x: x['ts'])])
    self.logger.info("Summary for segment %s is %s", msgs, summ)
    return summ