def iter_threads(self):
    """Yield one ``Thread`` per message row of ``table#listeMessages``.

    Only ``tr`` rows whose ``class`` attribute is ``msgLu`` or ``msgNonLu``
    are turned into threads; every other row is skipped. Each yielded
    thread has:

    * ``_link_id`` set to ``(page_id, unread)``, both taken from the
      arguments of the link's ``onclick`` handler;
    * ``date`` parsed from the ``data`` attribute of the ``td.colDate1``
      cell;
    * ``root`` set to a ``Message`` carrying title, date and sender, with
      its ``content`` left as ``NotLoaded``.

    Raises ``AttributeError`` if the ``onclick`` value or the raw date
    does not match the expected pattern (the regex match is ``None``), and
    ``ValueError`` if ``onclick`` does not carry exactly three arguments.
    """
    table = self.parser.select(self.document.getroot(), 'table#listeMessages', 1)
    for tr in table.xpath('./tr'):
        if tr.attrib.get('class', '') not in ('msgLu', 'msgNonLu'):
            continue

        author = unicode(self.parser.select(tr, 'td.colEmetteur', 1).text)
        link = self.parser.select(tr, 'td.colObjet a', 1)
        date_raw = self.parser.select(tr, 'td.colDate1', 1).attrib['data']

        # Take everything between the outermost parentheses of the onclick
        # value, split it on commas and strip quotes/spaces from each piece.
        # Raw string: "\(" is not a valid escape in a plain string literal.
        jsparams = re.search(r'\((.+)\)', link.attrib['onclick']).group(1)
        jsparams = [i.strip('\'" ') for i in jsparams.split(',')]
        page_id, _id, unread = jsparams
        # this means unread on the website
        unread = unread != "false"

        # Example of the raw format: 2012/02/29:01h30min45sec
        dt_match = re.match(r'(\d+)/(\d+)/(\d+):(\d+)h(\d+)min(\d+)sec', date_raw).groups()
        dt_match = [int(d) for d in dt_match]

        thread = Thread(_id)
        thread._link_id = (page_id, unread)
        thread.date = datetime(*dt_match)
        thread.title = unicode(link.text)

        message = Message(thread, 0)
        message.set_empty_fields(None)
        message.flags = message.IS_HTML
        message.title = thread.title
        message.date = thread.date
        message.sender = author
        message.content = NotLoaded  # This is the only thing we are missing
        thread.root = message

        yield thread
def iter_threads(self):
    """Yield one ``Thread`` per message row of ``table#listeMessages``.

    Only ``tr`` rows whose ``class`` attribute is ``msgLu`` or ``msgNonLu``
    are turned into threads; every other row is skipped. Each yielded
    thread has:

    * ``_link_id`` set to ``(page_id, unread)``, both taken from the
      arguments of the link's ``onclick`` handler;
    * ``date`` parsed from the ``data`` attribute of the ``td.colDate1``
      cell;
    * ``root`` set to a ``Message`` carrying title, date and sender, with
      its ``content`` left as ``NotLoaded``.

    Raises ``AttributeError`` if the ``onclick`` value or the raw date
    does not match the expected pattern (the regex match is ``None``), and
    ``ValueError`` if ``onclick`` does not carry exactly three arguments.
    """
    table = self.parser.select(self.document.getroot(), "table#listeMessages", 1)
    for tr in table.xpath("./tr"):
        if tr.attrib.get("class", "") not in ("msgLu", "msgNonLu"):
            continue

        author = unicode(self.parser.select(tr, "td.colEmetteur", 1).text)
        link = self.parser.select(tr, "td.colObjet a", 1)
        date_raw = self.parser.select(tr, "td.colDate1", 1).attrib["data"]

        # Take everything between the outermost parentheses of the onclick
        # value, split it on commas and strip quotes/spaces from each piece.
        # Raw string: "\(" is not a valid escape in a plain string literal.
        jsparams = re.search(r"\((.+)\)", link.attrib["onclick"]).group(1)
        jsparams = [i.strip("'\" ") for i in jsparams.split(",")]
        page_id, _id, unread = jsparams
        # this means unread on the website
        unread = unread != "false"

        # Example of the raw format: 2012/02/29:01h30min45sec
        dt_match = re.match(r"(\d+)/(\d+)/(\d+):(\d+)h(\d+)min(\d+)sec", date_raw).groups()
        dt_match = [int(d) for d in dt_match]

        thread = Thread(_id)
        thread._link_id = (page_id, unread)
        thread.date = datetime(*dt_match)
        thread.title = unicode(link.text)

        message = Message(thread, 0)
        message.set_empty_fields(None)
        message.flags = message.IS_HTML
        message.title = thread.title
        message.date = thread.date
        message.sender = author
        message.content = NotLoaded  # This is the only thing we are missing
        thread.root = message

        yield thread
def build_date(self, k):
    """Build the ``Message`` that represents the date key *k*.

    *k* is used as the argument tuple of two-placeholder ``%`` formats and
    is indexed at 0 and 1; those two items are converted with ``int`` and
    used as the year and month of the message date (day fixed to 1).

    The returned message has an empty content, ``_type`` set to ``'date'``
    and keeps the original key in ``_key``.
    """
    entry = Message(id='%s.%s' % k)
    entry.title = '%s/%s' % k
    entry.content = ''

    year = int(k[0])
    month = int(k[1])
    entry.date = date(year, month, 1)

    entry._type = 'date'
    entry._key = k
    return entry
def make_message(self, d, thread):
    """Create a ``Message`` in *thread* from the mapping *d*.

    Reads the ``'id'``, ``'from'``, ``'subject'``, ``'datetime'`` and
    ``'to'`` keys of *d*. The message starts with no children, flags set
    to 0, and a single-element receivers list.
    """
    message = Message(thread, d['id'])
    message.children = []
    message.sender = d['from']
    message.flags = 0
    message.title = d['subject']
    message.date = d['datetime']
    # Only one recipient is available in d, wrapped in a list.
    message.receivers = [d['to']]
    return message
def build_article(self, j):
    """Build the ``Message`` that represents the article described by *j*.

    *j* must provide ``'url'`` and ``'title'``. The URL is matched against
    ``self.article``, whose named groups ``title``, ``year`` and ``month``
    supply the message id and the date (day fixed to 1).

    The returned message is flagged ``IS_HTML``, has no children and has
    ``_type`` set to ``'article'``.
    """
    url = j['url']
    matched = self.article.match(url)

    article = Message(id=matched.group('title'))
    article.title = j['title']
    article.url = url
    article.flags = Message.IS_HTML

    year = int(matched.group('year'))
    month = int(matched.group('month'))
    article.date = date(year, month, 1)

    article.children = []
    article._type = 'article'
    return article
def make_message(self, d, thread):
    """Create a ``Message`` in *thread* from the mapping *d*.

    Reads the ``'id'``, ``'from'``, ``'subject'``, ``'datetime'`` and
    ``'to'`` keys of *d*. The optional ``'read'`` key defaults to true;
    when it is falsy the message flags are ``IS_UNREAD``, otherwise 0.
    """
    message = Message(thread, d['id'])
    message.children = []
    message.sender = d['from']

    # A missing 'read' key counts as already read.
    already_read = d.get('read', True)
    message.flags = 0 if already_read else message.IS_UNREAD

    message.title = d['subject']
    message.date = d['datetime']
    message.receivers = [d['to']]
    return message
def _get_messages(self, thread):
    """Yield a ``Message`` of *thread* for every message cell of the page.

    The ``table`` elements found under the ``PADpost_txt`` element are
    walked in reverse document order. A ``td`` is treated as a message
    only if it contains a link whose ``href`` matches the profile pattern;
    other cells are skipped.

    For each message:

    * the id is the message date formatted as ``%Y%m%dT%H%M%S`` and made
      unique through ``ovsparse.create_unique_id``;
    * the sender is whatever follows ``?`` in the profile link, and the
      receiver is the other party of the conversation;
    * the flags get ``IS_NOT_RECEIVED`` when either marker ``span`` is
      found in the cell, ``IS_RECEIVED`` otherwise.
    """
    container = self.document.find(True, 'PADpost_txt')
    seen_ids = set()

    dest_input = self.document.find('input', attrs={'type': 'hidden', 'name': 'Dest'})
    rcpt = dest_input['value']
    # Two-way lookup: whoever sent the message, the receiver is the other one.
    sender_to_receiver = {rcpt: self.browser.username, self.browser.username: rcpt}

    # Patterns do not depend on the cell, so build them once.
    profile_href = re.compile(r'profil_read.php\?.*')
    after_comma = re.compile(',.*')

    # site is sorted from latest to oldest message
    for table in reversed(container.findAll('table')):
        for cell in table.findAll('td'):
            profile_a = cell.find('a', href=profile_href)
            if not profile_a:
                continue

            # The message text starts after two consecutive <br> elements.
            first_br = cell.find('br')
            assert first_br.nextSibling.name == 'br'
            second_br = first_br.nextSibling
            text_nodes = ovsparse.all_next_siblings(second_br.nextSibling)

            # date will be used as id
            date_node = profile_a.findParent('div').find(text=after_comma)
            sitedate = date_node.replace(', ', '')
            sysdate = parse_french_date(sitedate)
            compactdate = datetime.datetime.strftime(sysdate, '%Y%m%dT%H%M%S')
            # but make it unique
            msg_id = ovsparse.create_unique_id(compactdate, seen_ids)
            seen_ids.add(msg_id)

            message = Message(thread, msg_id)
            message.sender = re.search(r'\?(.+)', profile_a['href']).group(1)
            message.receivers = [sender_to_receiver[message.sender]]
            message.date = sysdate
            message.content = ovsparse.html_message_to_text(text_nodes)

            notread_self = bool(cell.find('span', 'ColorSurligne'))
            notread_other = bool(cell.find('span', 'new_sortiepartenaire'))
            message.flags |= (
                Message.IS_NOT_RECEIVED
                if (notread_other or notread_self)
                else Message.IS_RECEIVED
            )

            yield message
def _get_messages(self, thread):
    """Yield a ``Message`` of *thread* for every message cell of the page.

    The ``table`` elements found under the ``PADpost_txt`` element are
    walked in reverse document order. A ``td`` is treated as a message
    only if it contains a link whose ``href`` matches the profile pattern;
    other cells are skipped.

    For each message:

    * the id is the message date formatted as ``%Y%m%dT%H%M%S`` and made
      unique through ``ovsparse.create_unique_id``;
    * the sender is whatever follows ``?`` in the profile link, and the
      receiver is the other party of the conversation;
    * the flags get ``IS_NOT_RECEIVED`` when either marker ``span`` is
      found in the cell, ``IS_RECEIVED`` otherwise.
    """
    container = self.document.find(True, "PADpost_txt")
    seen_ids = set()

    dest_input = self.document.find("input", attrs={"type": "hidden", "name": "Dest"})
    rcpt = dest_input["value"]
    # Two-way lookup: whoever sent the message, the receiver is the other one.
    sender_to_receiver = {rcpt: self.browser.username, self.browser.username: rcpt}

    # Patterns do not depend on the cell, so build them once.
    profile_href = re.compile(r"profil_read.php\?.*")
    after_comma = re.compile(",.*")

    # site is sorted from latest to oldest message
    for table in reversed(container.findAll("table")):
        for cell in table.findAll("td"):
            profile_a = cell.find("a", href=profile_href)
            if not profile_a:
                continue

            # The message text starts after two consecutive <br> elements.
            first_br = cell.find("br")
            assert first_br.nextSibling.name == "br"
            second_br = first_br.nextSibling
            text_nodes = ovsparse.all_next_siblings(second_br.nextSibling)

            # date will be used as id
            date_node = profile_a.findParent("div").find(text=after_comma)
            sitedate = date_node.replace(", ", "")
            sysdate = parse_french_date(sitedate)
            compactdate = datetime.datetime.strftime(sysdate, "%Y%m%dT%H%M%S")
            # but make it unique
            msg_id = ovsparse.create_unique_id(compactdate, seen_ids)
            seen_ids.add(msg_id)

            message = Message(thread, msg_id)
            message.sender = re.search(r"\?(.+)", profile_a["href"]).group(1)
            message.receivers = [sender_to_receiver[message.sender]]
            message.date = sysdate
            message.content = ovsparse.html_message_to_text(text_nodes)

            notread_self = bool(cell.find("span", "ColorSurligne"))
            notread_other = bool(cell.find("span", "new_sortiepartenaire"))
            message.flags |= (
                Message.IS_NOT_RECEIVED
                if (notread_other or notread_self)
                else Message.IS_RECEIVED
            )

            yield message