def get_message_weight(self,body):
        """Return the mean log10 training frequency of the terms in *body*.

        A one-document term matrix is built from ``body``; each of its terms
        that also appears in ``self.tdm.get_term_freq()`` contributes
        ``log10(frequency)`` to the average.

        Args:
            body: Message body text, passed to
                ``SimpleTermDocumentMatrix.add_doc``.

        Returns:
            The arithmetic mean of the log10 frequencies of all matching
            terms, or ``1`` when no body term is found in the training
            term frequencies (including when the body yields no terms).

        Raises:
            ValueError: If a matching term has a non-positive frequency
                (raised by ``math.log10``).
        """
        body_tdm = SimpleTermDocumentMatrix()
        body_tdm.add_doc(body)
        body_terms = body_tdm.get_terms(min_doc_freq=1)

        # NOTE(review): assumes get_term_freq() returns a dict-like mapping of
        # term -> frequency (the code iterates its keys and indexes by them).
        tm = self.tdm.get_term_freq()

        # Build a single list across ALL body terms. Rebinding the list inside
        # a per-term loop would keep only the last term's value and leave the
        # name unbound when the body has no terms at all.
        tm_log10 = [math.log10(tm[term]) for term in body_terms if term in tm]

        # Neutral weight of 1 when none of the body terms were seen in training.
        mean = sum(tm_log10) / len(tm_log10) if tm_log10 else 1
        return mean
def get_tdm(email_path):
    """Build a SimpleTermDocumentMatrix from one e-mail file or a directory.

    When ``email_path`` is a directory, every regular file directly inside it
    is used (no recursion); otherwise ``email_path`` itself is treated as the
    single input file. The body of each file is obtained via ``get_body`` and
    added to the matrix as one document.

    Args:
        email_path: Path to an e-mail file or to a directory of e-mail files.

    Returns:
        The populated ``SimpleTermDocumentMatrix``.
    """
    if not os.path.isdir(email_path):
        paths = [email_path]
    else:
        paths = []
        for entry in os.listdir(email_path):
            # Only regular files are considered; sub-directories are skipped.
            if os.path.isfile(os.path.join(email_path, entry)):
                paths.append(email_path + "/" + entry)

    matrix = SimpleTermDocumentMatrix()
    for path in paths:
        matrix.add_doc(get_body(path))

    return matrix
    def train(self,files):
        """Accumulate training state from the given e-mail files.

        Each file is parsed with ``parse_email``; the first four items of the
        result are used as sender, subject, date and body. The body is added
        to a fresh term-document matrix, and the sender/thread accumulators
        are updated through ``calc_sender_freq`` and ``calc_weight_on_thread``.

        The results are stored on the instance only after every file has been
        processed: ``self.senders_freq``, ``self.threads_weight`` and
        ``self.tdm``.

        Args:
            files: Iterable of e-mail file paths to train on.
        """
        sender_counts = {}
        thread_info = {}
        term_matrix = SimpleTermDocumentMatrix()

        for path in files:
            parsed = parse_email(path)
            sender, subject, date_, body = (
                parsed[0], parsed[1], parsed[2], parsed[3]
            )

            term_matrix.add_doc(body)
            # Both helpers receive the accumulator dict to update.
            self.calc_sender_freq(sender, sender_counts)
            self.calc_weight_on_thread(subject, sender, date_, thread_info)

        self.senders_freq = sender_counts
        self.threads_weight = thread_info
        self.tdm = term_matrix