def ExtractThreads(message_infos, dummy_split_threshold=10):
    """Group messages into conversation threads using jwzthreading.

    Args:
        message_infos: Iterable of objects exposing a ``headers`` attribute
            that is handed to ``jwzthreading.make_message``.
        dummy_split_threshold: Minimum number of first-level children a
            dummy root container must have before it is split into one
            thread per child. Defaults to 10, the value that used to be
            hard-coded here.

    Returns:
        A list of jwzthreading containers. Each returned container has a
        ``subject`` attribute set to the key it was filed under in the
        table returned by ``jwzthreading.thread``, and every threaded
        message carries a ``message_info`` attribute pointing back at the
        input object it was built from.
    """
    thread_messages = []
    for message_info in message_infos:
        try:
            thread_message = jwzthreading.make_message(message_info.headers)
        except ValueError:
            # make_message rejected these headers; leave the message out.
            continue
        if not thread_message:
            continue
        thread_message.message_info = message_info
        thread_messages.append(thread_message)

    thread_dict = jwzthreading.thread(thread_messages)

    containers = []
    for subject, container in thread_dict.items():
        # jwzthreading is too aggressive in threading by subject and will
        # combine distinct threads that happen to have the same subject.
        # Split them up if we have a dummy container that has lots of
        # children at the first level.
        split = (container.is_dummy()
                 and len(container.children) >= dummy_split_threshold)
        roots = container.children if split else [container]
        for root in roots:
            root.subject = subject
            containers.append(root)
    return containers
def test_basic_message(self):
    """make_message() exposes the subject and the de-duplicated references."""
    # Headers are newline-separated and a blank line ends the header block;
    # spelling the newlines out keeps the fixture independent of source
    # layout.
    msg = message_from_string(
        "Subject: random\n"
        "Message-ID: <message1>\n"
        "References: <ref1> <ref2> <ref1>\n"
        "In-Reply-To: <reply>\n"
        "\n"
        "Body."
    )
    m = jwzthreading.make_message(msg)
    self.assertTrue(repr(m))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(m.subject, 'random')
    self.assertEqual(sorted(m.references), ['ref1', 'ref2', 'reply'])

    # Verify that repr() works
    repr(m)
def thread_mails(emails):
    """Thread ``emails`` with jwzthreading and sync their stored parents.

    Each mail is parsed from ``mail.imported_blob``, threaded, the resulting
    containers are passed to ``jwzthreading.print_container``, and then every
    stored mail whose parent differs from the one computed by the threading
    algorithm is re-parented through ``set_parent``.

    Args:
        emails: Iterable of mail objects. The code reads ``imported_blob``,
            ``subject.first_original().value``, ``parent``, ``message_id``
            and ``id`` from them and calls ``set_parent()`` on them.

    Returns:
        None.
    """
    # print('Threading...')
    emails_for_threading = []
    for mail in emails:
        email_for_threading = jwzthreading.make_message(
            email.message_from_string(mail.imported_blob))
        # Store our email subject, jwzthreading does not decode subject itself
        email_for_threading.subject = mail.subject.first_original().value
        # Store our email object pointer instead of the raw message text
        email_for_threading.message = mail
        emails_for_threading.append(email_for_threading)

    threaded_emails = jwzthreading.thread(emails_for_threading)

    # Output, ordered by subject. sorted() works whether items() returns a
    # list or a view.
    for subj, container in sorted(threaded_emails.items()):
        jwzthreading.print_container(container, 0, True)

    def update_threading(threaded_emails, parent=None, debug=False):
        """Recursively align stored parents with the computed thread tree.

        Args:
            threaded_emails: Containers of the current tree level.
            parent: Container handed down by the caller as the parent of
                this level, or None at the top level.
            debug: When true, print a trace of every decision.
        """
        if debug:
            print("\n\nEntering update_threading() for %s mails:"
                  % len(threaded_emails))
        for container in threaded_emails:
            if debug:
                # jwzthreading.print_container(container)
                if container.message:
                    # The subject was already flattened to its value when
                    # the message was built, so it is printed as is.
                    print("\nProcessing: "
                          + repr(container.message.subject) + " "
                          + repr(container.message.message_id) + " "
                          + repr(container.message.message.id))
                else:
                    # Dummy containers carry no message to describe.
                    print("\nProcessing: <dummy container>")
                print("container: " + repr(container))
                print("parent: " + repr(container.parent))
                print("children: " + repr(container.children))

            if not container.message:
                if debug:
                    print("Current message ID: None, was a dummy container")
                # A dummy container is skipped: its children inherit the
                # parent that was handed to this level.
                update_threading(container.children, parent, debug=debug)
                continue

            current_parent = container.message.message.parent
            if current_parent:
                db_parent_message_id = current_parent.message_id
            else:
                db_parent_message_id = None

            algorithm_parent_message_id = None
            if parent:
                if parent.message:
                    # jwzthreading strips the <>, re-add them
                    algorithm_parent_message_id = (
                        u"<" + parent.message.message_id + u">")
                elif debug:
                    print("Parent was a dummy container, we may need "
                          "to handle this case better, as we just "
                          "potentially lost sibbling relationships")

            if debug:
                print("Current parent from database: "
                      + repr(db_parent_message_id))
                print("Current parent from algorithm: "
                      + repr(algorithm_parent_message_id))
                print("References: " + repr(container.message.references))

            if algorithm_parent_message_id != db_parent_message_id:
                if current_parent is None or isinstance(current_parent,
                                                        Email):
                    if debug:
                        print("UPDATING PARENT for :"
                              + repr(container.message.message.message_id))
                    new_parent = (parent.message.message
                                  if algorithm_parent_message_id else None)
                    if debug:
                        print(repr(new_parent))
                    container.message.message.set_parent(new_parent)
                elif debug:
                    print("Skipped reparenting: the current parent "
                          "isn't an email, the threading algorithm only "
                          "considers mails")
            update_threading(container.children, container, debug=debug)

    update_threading(threaded_emails.values(), debug=False)
# Report the distinct space-separated tokens of the filtered text and how
# many there are; the set is built once and reused.
tex = filter_text(text)
unique_words = set(tex.split(' '))
print("%s %s" % (sorted(unique_words), len(unique_words)))
write_to_orig_html_file(sentences)
words = nltk.wordpunct_tokenize(tex)
locs = search_dict(words)
write_to_mod_html_file(sentences, locs, tex)
print("%s %s" % (sorted(report_words), len(report_words)))

build_dic_words()

# Parse the first ten .eml files and turn each one into a jwz message.
files = glob.glob('/root/bngbirds-data/bngbirds/*.eml')
msglist = []
for eml_path in files[:10]:
    # The context manager closes the file even if parsing raises.
    with open(eml_path, 'r') as fp:
        msg = email.message_from_file(fp)
    m = jwz.make_message(msg, eml_path)
    msglist.append(m)

subject_table = jwz.thread(msglist)

# Walk the threads in subject order; every thread starts at depth 0 with
# its own zero-defaulting counter.
L = sorted(subject_table.items())
for subj, container in L:
    sent = defaultdict(int)
    depth = 0
    process_single_message(container, depth, sent)
def thread_mails(emails):
    """Thread ``emails`` with jwzthreading and sync their stored parents.

    Every mail is parsed from ``mail.imported_blob`` and threaded; the
    resulting containers are handed to ``jwzthreading.print_container``;
    finally each stored mail whose parent differs from the one computed by
    the threading algorithm is re-parented with ``set_parent``.

    Args:
        emails: Iterable of mail objects. The code reads ``imported_blob``,
            ``subject.first_original().value``, ``parent``, ``message_id``
            and ``id`` from them and calls ``set_parent()`` on them.

    Returns:
        None.
    """
    # print('Threading...')
    emails_for_threading = []
    for mail in emails:
        parsed = email.message_from_string(mail.imported_blob)
        email_for_threading = jwzthreading.make_message(parsed)
        # Store our email subject, jwzthreading does not decode subject itself
        email_for_threading.subject = mail.subject.first_original().value
        # Store our email object pointer instead of the raw message text
        email_for_threading.message = mail
        emails_for_threading.append(email_for_threading)

    threaded_emails = jwzthreading.thread(emails_for_threading)

    # Output, ordered by subject. sorted() accepts both a list and a view.
    for subj, container in sorted(threaded_emails.items()):
        jwzthreading.print_container(container, 0, True)

    def update_threading(threaded_emails, parent=None, debug=False):
        """Recursively align stored parents with the computed thread tree.

        Args:
            threaded_emails: Containers of the current tree level.
            parent: Container handed down by the caller as the parent of
                this level, or None at the top level.
            debug: When true, print a trace of every decision.
        """
        if debug:
            print("\n\nEntering update_threading() for %s mails:"
                  % len(threaded_emails))
        for container in threaded_emails:
            if debug:
                # jwzthreading.print_container(container)
                if container.message:
                    # The subject stored on the threading message is
                    # already the flattened value, so print it directly.
                    print("\nProcessing: "
                          + repr(container.message.subject) + " "
                          + repr(container.message.message_id) + " "
                          + repr(container.message.message.id))
                else:
                    # Nothing to describe for a dummy container.
                    print("\nProcessing: <dummy container>")
                print("container: " + repr(container))
                print("parent: " + repr(container.parent))
                print("children: " + repr(container.children))

            if container.message:
                current_parent = container.message.message.parent
                db_parent_message_id = (
                    current_parent.message_id if current_parent else None)

                if parent and parent.message:
                    # jwzthreading strips the <>, re-add them
                    algorithm_parent_message_id = (
                        u"<" + parent.message.message_id + u">")
                else:
                    if parent and debug:
                        print("Parent was a dummy container, we may need "
                              "to handle this case better, as we just "
                              "potentially lost sibbling relationships")
                    algorithm_parent_message_id = None

                if debug:
                    print("Current parent from database: "
                          + repr(db_parent_message_id))
                    print("Current parent from algorithm: "
                          + repr(algorithm_parent_message_id))
                    print("References: "
                          + repr(container.message.references))

                if algorithm_parent_message_id != db_parent_message_id:
                    if current_parent is None or isinstance(
                            current_parent, Email):
                        if debug:
                            print("UPDATING PARENT for :" + repr(
                                container.message.message.message_id))
                        if algorithm_parent_message_id:
                            new_parent = parent.message.message
                        else:
                            new_parent = None
                        if debug:
                            print(repr(new_parent))
                        container.message.message.set_parent(new_parent)
                    elif debug:
                        print("Skipped reparenting: the current parent "
                              "isn't an email, the threading algorithm only "
                              "considers mails")
                update_threading(container.children, container, debug=debug)
            else:
                if debug:
                    print("Current message ID: None, was a dummy container")
                # Children of a dummy container inherit the parent that was
                # handed to this level.
                update_threading(container.children, parent, debug=debug)

    update_threading(threaded_emails.values(), debug=False)
def thread_mails(emails):
    """Thread ``emails`` with jwzthreading and sync their posts' parents.

    Every mail is parsed from ``mail.full_message`` and threaded; the
    resulting containers are handed to ``jwzthreading.print_container``;
    finally the post of each mail is re-parented with ``set_parent`` when
    its stored parent disagrees with the computed thread tree.

    Args:
        emails: Iterable of mail objects. The code reads ``full_message``,
            ``subject``, ``message_id``, ``source_id`` and ``post`` from
            them; ``post`` exposes ``parent`` and ``set_parent()``.

    Returns:
        None.
    """
    print('Threading...')
    emails_for_threading = []
    for mail in emails:
        email_for_threading = jwzthreading.make_message(
            email.message_from_string(mail.full_message))
        # Store our email subject, jwzthreading does not decode subject itself
        email_for_threading.subject = mail.subject
        # Store our email object pointer instead of the raw message text
        email_for_threading.message = mail
        emails_for_threading.append(email_for_threading)

    threaded_emails = jwzthreading.thread(emails_for_threading)

    # Output, ordered by subject. sorted() accepts both a list and a view.
    for subj, container in sorted(threaded_emails.items()):
        jwzthreading.print_container(container, 0, True)

    def reparent(container, parent, algorithm_parent_message_id):
        """Attach the container's post to the computed parent, or detach it."""
        new_parent = (parent.message.message.post
                      if algorithm_parent_message_id else None)
        container.message.message.post.set_parent(new_parent)

    def update_threading(threaded_emails, parent=None):
        """Recursively align stored post parents with the thread tree.

        Args:
            threaded_emails: Containers of the current tree level.
            parent: Container handed down by the caller as the parent of
                this level, or None at the top level.
        """
        for container in threaded_emails:
            if not container.message:
                # Dummy container: its children inherit the parent that was
                # handed to this level.
                update_threading(container.children, parent)
                continue

            current_parent = container.message.message.post.parent
            if current_parent:
                db_parent_message_id = current_parent.content.message_id
            else:
                db_parent_message_id = None

            algorithm_parent_message_id = None
            if parent and parent.message:
                # jwzthreading strips the <>, re-add them
                algorithm_parent_message_id = (
                    u"<" + parent.message.message_id + u">")
            # Otherwise there is no parent, or the parent was a dummy
            # container. We may need to handle the latter better, as we
            # just potentially lost sibling relationships.

            if algorithm_parent_message_id != db_parent_message_id:
                # Don't reparent if the current parent isn't an email, the
                # threading algorithm only considers mails
                if current_parent is None or isinstance(
                        current_parent.content, Email):
                    reparent(container, parent, algorithm_parent_message_id)

            # NOTE(review): this check is assumed to run regardless of the
            # ID comparison above -- confirm against the intended nesting.
            if (current_parent
                    and current_parent.content.source_id
                    != container.message.message.source_id):
                # This is to correct past mistakes in the database, remove
                # it once everyone ran it benoitg 2013-11-20
                print("UPDATING PARENT, BAD ORIGINAL SOURCE"
                      + repr(current_parent.content.source_id) + " "
                      + repr(container.message.message.source_id))
                reparent(container, parent, algorithm_parent_message_id)

            update_threading(container.children, container)

    update_threading(threaded_emails.values())
def thread_mails(emails):
    """Thread ``emails`` with jwzthreading and sync their stored parents.

    Every mail is parsed from ``mail.imported_blob`` (decoded as ASCII when
    it is not already a native string) and threaded; the resulting
    containers are handed to ``jwzthreading.print_container``; finally each
    stored mail whose parent differs from the one computed by the threading
    algorithm is re-parented with ``set_parent``.

    Args:
        emails: Iterable of mail objects. The code reads ``imported_blob``,
            ``subject.first_original().value``, ``parent``, ``message_id``
            and ``id`` from them and calls ``set_parent()`` on them.

    Returns:
        None.
    """
    # log.debug('Threading...')
    emails_for_threading = []
    for mail in emails:
        blob = mail.imported_blob
        if not isinstance(blob, native_str):
            blob = blob.decode('ascii')
        email_for_threading = jwzthreading.make_message(
            email.message_from_string(blob))
        # Store our email subject, jwzthreading does not decode subject itself
        email_for_threading.subject = mail.subject.first_original().value
        # Store our email object pointer instead of the raw message text
        email_for_threading.message = mail
        emails_for_threading.append(email_for_threading)

    threaded_emails = jwzthreading.thread(emails_for_threading)

    # Output, ordered by subject.
    for subj, container in sorted(threaded_emails.items()):
        jwzthreading.print_container(container, 0, True)

    def update_threading(threaded_emails, parent=None, debug=False):
        """Recursively align stored parents with the computed thread tree.

        Args:
            threaded_emails: Containers of the current tree level.
            parent: Container handed down by the caller as the parent of
                this level, or None at the top level.
            debug: Accepted and passed down the recursion; tracing itself
                goes through ``log.debug``.
        """
        # Arguments are passed separately so the logger only formats the
        # message when the record is actually emitted.
        log.debug("\n\nEntering update_threading() for %d mails:",
                  len(threaded_emails))
        for container in threaded_emails:
            # jwzthreading.print_container(container)
            if container.message:
                # %s rather than %d for the id, so an unset id cannot make
                # the trace line itself raise.
                message_string = "%s %s %s " % (
                    container.message.subject,
                    container.message.message_id,
                    container.message.message.id)
            else:
                message_string = "null "
            log.debug(
                "Processing: %s container: %s parent: %s children :%s",
                message_string, container, container.parent,
                container.children)

            if not container.message:
                log.debug("Current message ID: None, was a dummy container")
                # Children of a dummy container inherit the parent that was
                # handed to this level.
                update_threading(container.children, parent, debug=debug)
                continue

            current_parent = container.message.message.parent
            if current_parent:
                db_parent_message_id = current_parent.message_id
            else:
                db_parent_message_id = None

            algorithm_parent_message_id = None
            if parent:
                if parent.message:
                    # jwzthreading strips the <>, re-add them
                    algorithm_parent_message_id = (
                        u"<" + parent.message.message_id + u">")
                else:
                    # warning() replaces the deprecated warn() alias.
                    log.warning(
                        "Parent was a dummy container, we may need "
                        "to handle this case better, as we just "
                        "potentially lost sibling relationships")

            log.debug("Current parent from database: %r",
                      db_parent_message_id)
            log.debug("Current parent from algorithm: %r",
                      algorithm_parent_message_id)
            log.debug("References: %r", container.message.references)

            if algorithm_parent_message_id != db_parent_message_id:
                if current_parent is None or isinstance(current_parent,
                                                        Email):
                    log.debug("UPDATING PARENT for :%r",
                              container.message.message.message_id)
                    new_parent = (parent.message.message
                                  if algorithm_parent_message_id else None)
                    log.debug("%r", new_parent)
                    container.message.message.set_parent(new_parent)
                else:
                    log.debug(
                        "Skipped reparenting: the current parent "
                        "isn't an email, the threading algorithm only "
                        "considers mails")
            update_threading(container.children, container, debug=debug)

    update_threading(list(threaded_emails.values()), debug=False)
# for text in to_text_list(ctr.message.message): # # print h2t.html2text(text.replace('\r\n', '\n').replace('=\n', '')) # print text # if raw_input().strip() == 'q': return for c in ctr.children: print_container(c, depth + 1, tiddler_name) print('Reading input file...') if False: mbox_path = '/Volumes/ramdisk/ccmt6.mbox' # with open(mbox_path, 'rb') as ifile: # mbox = mailbox.UnixMailbox(ifile) mbox = mailbox.mbox(mbox_path) mlist = list(mbox) msglist = [make_message(m) for m in mlist] else: eml_path = '/tmp/ccmt-1' mlist = [ mailbox.mboxMessage(open(eml).read()) for eml in glob(pjoin(eml_path, '*.eml')) ] msglist = [make_message(m) for m in mlist] print('Threading...') subject_table = thread(msglist) # Output L = subject_table.items() L.sort() for subj, container in L: