def tokenize(self):
    """Tokenize every mail and build the corpus vocabulary.

    Side effects:
        - sets ``mail.sents`` on each mail in ``self.mails``: the tokenized
          sentences of the subject (``sujet``) followed by those of the
          description;
        - sets ``self.vocab``: a ``Counter`` mapping each token to its total
          number of occurrences across all sentences of all mails.
    """
    counts = Counter()
    for mail in self.mails:
        sentences = list(iterTokenizedSentences(mail.sujet))
        sentences.extend(iterTokenizedSentences(mail.description))
        mail.sents = sentences
        for sentence in sentences:
            counts.update(sentence)
    self.vocab = counts
"contact-nom", "contact-email", "date-candidature", "validite", "duree", "ville", "lieu", "labo", ] outdir = "archives_SFBI_AnnotationManuelle" mails = list(mailLoaderGen()) words = Counter() for mail in mails: mail.sents = list(iterTokenizedSentences(mail.description)) for sent in mail.sents: words.update(sent) stemmer = Stemmer(set(word for (word, n) in words.items() if n > 10)) for m in mails: outf = outdir + m.mailfile.strip("archives_SFBI") d = m.__dict__ d["date"] = date.fromtimestamp(d["timestamp"]).strftime("%d %B %Y") with open(outf, "wt") as f: d["from"] = d.pop("sender") if m.sfbi: ce = d["contact-email"] ce = "\t".join(ce) if type(ce) is set else ce