# Exemple #1
# 0
 def tokenize(self):
     """Tokenize every mail and build the corpus vocabulary.

     Side effects: sets ``mail.sents`` (tokenized subject + description
     sentences) on each mail in ``self.mails`` and stores the resulting
     word-frequency ``Counter`` in ``self.vocab``.
     """
     counts = Counter()
     for message in self.mails:
         subject_sents = list(iterTokenizedSentences(message.sujet))
         body_sents = list(iterTokenizedSentences(message.description))
         message.sents = subject_sents + body_sents
         for sentence in message.sents:
             counts.update(sentence)
     self.vocab = counts
    "contact-nom",
    "contact-email",
    "date-candidature",
    "validite",
    "duree",
    "ville",
    "lieu",
    "labo",
]

outdir = "archives_SFBI_AnnotationManuelle"

# Load the whole mail corpus up front.
mails = list(mailLoaderGen())

# Frequency of every token across all tokenized description sentences.
# Also caches the tokenized sentences on each mail as ``mail.sents``.
words = Counter()
for mail in mails:
    mail.sents = list(iterTokenizedSentences(mail.description))
    words.update(token for sentence in mail.sents for token in sentence)

# Stem against the vocabulary of words seen more than 10 times.
frequent_words = {word for word, count in words.items() if count > 10}
stemmer = Stemmer(frequent_words)

# Write each mail out as an annotated file under the manual-annotation tree.
for m in mails:
    # NOTE(review): str.strip("archives_SFBI") removes those *characters* from
    # both ends of the string, not the literal prefix — presumably m.mailfile
    # starts with "archives_SFBI"; confirm whether removeprefix was intended.
    outf = outdir + m.mailfile.strip("archives_SFBI")
    d = m.__dict__
    # Human-readable date (locale-dependent month name) from the Unix timestamp.
    d["date"] = date.fromtimestamp(d["timestamp"]).strftime("%d %B %Y")

    with open(outf, "wt") as f:
        # The output format uses the key "from" instead of "sender".
        d["from"] = d.pop("sender")
        if m.sfbi:
            ce = d["contact-email"]
            # A set of addresses is flattened to a tab-separated string;
            # a plain string value passes through unchanged.
            ce = "\t".join(ce) if type(ce) is set else ce