def accumulate_body(self):
    """Return the body string extracted from this mail."""
    # TODO: don't hardcode which part is considered body but allow toggle
    # commands and a config default setting
    mail = self.get_email()
    return extract_body(mail)
def get_email(self):
    """Return the parsed email message for this mail, loading it on first use.

    The message is read from the file named by
    ``self.message.get_filename()`` and cached in ``self._email``; later
    calls return the cached object. If the file cannot be opened, a
    placeholder message (subject "Caution!", naming the missing path) is
    cached and returned instead.
    """
    if not self._email:
        path = self.message.get_filename()
        try:
            with open(path) as f:
                self._email = email.message_from_file(f)
        except IOError:
            # The file vanished or is unreadable: substitute a warning mail
            # so callers still get a message object to work with.
            warning = "Subject: Caution!\n"\
                      "Message file is no longer accessible:\n%s" % path
            self._email = email.message_from_string(warning)
    # Hand back the message object itself; callers such as accumulate_body()
    # pass it on to extract_body(), so the body must not be extracted here.
    return self._email
def get_text_content(self):
    """Return what extract_body yields for this mail with types=['text/plain']."""
    plain_only = ['text/plain']
    mail = self.get_email()
    return extract_body(mail, types=plain_only)
def accumulate_body(self):
    """Return the body string extracted from this mail."""
    # TODO: allow toggle commands to decide which part is considered body
    body_string = extract_body(self.get_email())
    return body_string
def read_data(self, name, type = 1):
    """Parse one Reuters XML file and append its text and labels to the data set.

    name -- file name, resolved inside a sub-directory of ``self.path_data``.
    type -- selects the sub-directory: 1 -> "train/", 2 -> "test/",
            3 -> "first3/". Any other value leaves the path empty, so the
            parse call fails. (The name shadows the builtin but is kept for
            callers that pass it by keyword.)

    For every TEXT element with a body, a vector of ``config.label_size``
    zeros is built and set to 1.0 at the index of each label found under
    TOPICS, PLACES, PEOPLE, ORGS and EXCHANGES; ``self.label_examples`` is
    incremented for each of those labels. When at least one label was
    recognised, the vector is appended to ``self.labels_train`` and the
    cleaned body text to ``self.texts_train``. Returns None.
    """
    ruta = ""
    for code, subdir in ((1, "train/"), (2, "test/"), (3, "first3/")):
        if type == code:
            ruta = self.path_data + subdir + name
            break

    reuters = et.parse(ruta, et.XMLParser(encoding='ISO-8859-1')).getroot()

    # Sections whose <D> children carry labels, in the order they are scanned.
    label_sections = ("TOPICS", "PLACES", "PEOPLE", "ORGS", "EXCHANGES")

    for text in reuters.findall("TEXT"):
        body = utils.extract_body(text)
        if body is None or body == "":
            # No body to store: skip before body.text is dereferenced below.
            continue

        labels_temp = np.zeros(config.label_size)
        all_labels = 0
        for section in label_sections:
            for a_topic in reuters.findall(section):
                for a_d in a_topic.findall("D"):
                    try:
                        label_index = utils.find_label_index(a_d.text)
                        labels_temp[label_index] = 1.0
                        self.label_examples[label_index] += 1
                        all_labels += 1
                    except ValueError:
                        # Label lookup failed; ignore this label.
                        continue

        if all_labels != 0:
            # Only documents with at least one recognised label are kept.
            self.labels_train = np.append(self.labels_train, labels_temp)
            self.texts_train = np.append(self.texts_train,
                                         utils.stop_characters(body.text))