def loadMails(self):
    """Load all mails from the archive directory into ``self.mails``.

    Each mail yielded by ``mailLoaderGen(self.archivedir)`` is counted in
    the ``alltags`` tally (defined outside this method): its ``tags`` are
    added when that attribute exists, and the count for its ``type`` is
    incremented when that attribute exists. The mail is then registered
    through ``self.obj2id.add`` and the value returned is stored as
    ``mail.id``.
    """
    registry = self.obj2id
    loaded = []
    for message in mailLoaderGen(self.archivedir):
        # Both attributes are optional on a mail object.
        if hasattr(message, 'tags'):
            alltags.update(message.tags)
        if hasattr(message, 'type'):
            kind = message.type
            alltags[kind] += 1
        message.id = registry.add(message)
        loaded.append(message)
    # Assign the attribute only after every mail has been processed.
    self.mails = loaded
"date", "tags", "sfbi_url", "contact-nom", "contact-email", "date-candidature", "validite", "duree", "ville", "lieu", "labo", ] outdir = "archives_SFBI_AnnotationManuelle" mails = list(mailLoaderGen()) words = Counter() for mail in mails: mail.sents = list(iterTokenizedSentences(mail.description)) for sent in mail.sents: words.update(sent) stemmer = Stemmer(set(word for (word, n) in words.items() if n > 10)) for m in mails: outf = outdir + m.mailfile.strip("archives_SFBI") d = m.__dict__ d["date"] = date.fromtimestamp(d["timestamp"]).strftime("%d %B %Y") with open(outf, "wt") as f: d["from"] = d.pop("sender")