Example #1
0
    def get_thread_mails(self):
        """Collect the messages of the thread shown in ``self.document``.

        Returns a dict with two keys:

        * ``'member'``: a dict whose ``'pseudo'`` entry is the cleaned text of
          the name element in the message heading, or ``'Unknown'`` when that
          lookup raises ``IndexError``.
        * ``'messages'``: a list of dicts with ``'date'``, ``'message'`` and
          ``'id_from'`` entries, built by walking the matching ``<li>``
          elements in reverse document order.
        """
        try:
            name_node = self.document.getroot().cssselect('div#message_heading div.username span.name')[0]
            pseudo = self.parser.tocleanstring(name_node)
        except IndexError:
            pseudo = 'Unknown'

        messages = []
        items = self.document.xpath('//ul[@id="thread"]//li[contains(@id, "message_")]')
        for item in reversed(items):
            try:
                body_html = self.parser.tostring(item.xpath('.//div[@class="message_body"]')[0])
            except IndexError:
                # 'Match' message: the item has no message body, skip it.
                continue
            text = html2text(body_html).strip()

            # The timestamp is read from the first run of digits followed by
            # ", " inside the script element of the timestamp span.
            script = item.xpath('.//span[@class="timestamp"]//script')[0]
            match = re.search(r'(\d+), ', script.text)
            assert match
            date = local2utc(datetime.fromtimestamp(int(match.group(1))))

            # The sender id is the last path component of the first link,
            # without any query string.
            href = item.find('a').attrib['href']
            sender = href.split('/')[-1].split('?')[0]

            messages.append({
                'date' : date,
                'message' : unicode(text),
                'id_from' : unicode(sender),
            })

        return {
            'member' : {'pseudo' : pseudo},
            'messages' : messages,
        }
Example #2
0
 def parse_date(self, date_s):
     """Parse a site date string and convert it with ``local2utc``.

     ``date_s`` is stripped and encoded to UTF-8.  An empty result means
     the current time is used; otherwise the string is parsed with the
     format ``le %d/%m/%Y <a-grave> %H:%M.``.
     """
     raw = date_s.strip().encode('utf-8')
     if raw:
         fmt = u'le %d/%m/%Y \xe0 %H:%M.'.encode('utf-8')
         return local2utc(datetime.strptime(raw, fmt))
     return local2utc(datetime.now())
Example #3
0
    def parse(self):
        """Fill this object's fields from the element stored in ``self.div``.

        Sets ``url``, ``title``, ``author``, ``username``, ``date``, ``body``
        and ``score``.  ``signature`` is only set when a ``p.signature``
        element is found in the content, and ``relevance_url`` /
        ``relevance_token`` only when the footer holds at least one
        ``form.button_to``.
        """
        parser = self.browser.parser

        self.url = '%s#%s' % (self.preurl, self.div.attrib['id'])
        self.title = unicode(parser.select(self.div.find('h2'), 'a.title', 1).text)

        # The first <p> child carries the author link, the date and the score.
        meta = self.div.find('p')
        try:
            a = parser.select(meta, 'a[rel=author]', 1)
        except BrokenPageError:
            # The author link could not be selected: use the anonymous defaults.
            self.author = 'Anonyme'
            self.username = None
        else:
            self.author = unicode(a.text)
            self.username = unicode(a.attrib['href'].split('/')[2])

        # Everything after '+' in the datetime attribute is dropped before
        # parsing; the resulting naive value is then passed to local2utc.
        stamp = parser.select(meta, 'time', 1).attrib['datetime'].split('+')[0]
        self.date = local2utc(datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S'))

        content = self.div.find('div')
        try:
            signature = parser.select(content, 'p.signature', 1)
        except BrokenPageError:
            # No signature.
            pass
        else:
            # Detach the signature so it does not end up in the body.
            content.remove(signature)
            self.signature = parser.tostring(signature)
        self.body = parser.tostring(content)

        self.score = int(parser.select(meta, 'span.score', 1).text)

        forms = parser.select(self.div.find('footer'), 'form.button_to')
        if forms:
            action = forms[0].attrib['action']
            # Remove the trailing vote word as a suffix.  str.rstrip() treats
            # its argument as a set of characters, so it could strip more
            # than the intended word.
            for suffix in ('for', 'against'):
                if action.endswith(suffix):
                    action = action[:-len(suffix)]
                    break
            self.relevance_url = action
            self.relevance_token = parser.select(forms[0], 'input[name=authenticity_token]', 1).attrib['value']
Example #4
0
    def __init__(self, browser, url, tree):
        """Initialise the content from ``url`` and, when given, its ``tree``.

        ``url`` and ``id`` (via ``url2id``) are always set.  When ``tree`` is
        ``None`` nothing else is read.  Otherwise ``title``, ``author``,
        ``username``, ``body`` and ``score`` are extracted from the tree;
        ``date`` is set unless ``BrokenPageError`` is raised while reading it,
        and ``relevance_url`` / ``relevance_token`` are set for each footer
        form whose action ends with ``/for`` (the last such form wins).
        """
        Content.__init__(self, browser)
        self.url = url
        self.id = url2id(self.url)

        if tree is None:
            return

        parser = self.browser.parser
        header = tree.find('header')
        self.title = u' — '.join([a.text for a in header.find('h1').findall('a')])
        try:
            a = parser.select(header, 'a[rel=author]', 1)
        except BrokenPageError:
            # The author link could not be selected: use the anonymous defaults.
            self.author = 'Anonyme'
            self.username = None
        else:
            self.author = unicode(a.text)
            self.username = unicode(a.attrib['href'].split('/')[2])
        self.body = parser.tostring(parser.select(tree, 'div.content', 1))
        try:
            # Everything after '+' in the datetime attribute is dropped
            # before parsing.
            stamp = parser.select(header, 'time', 1).attrib['datetime'].split('+')[0]
            self.date = datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
            self.date = local2utc(self.date)
        except BrokenPageError:
            # Best effort: the date is simply left unset.
            pass
        for form in parser.select(tree.find('footer'), 'form.button_to'):
            action = form.attrib['action']
            if action.endswith('/for'):
                # Drop the trailing 'for' as a suffix and keep the slash.
                # str.rstrip() works on character sets, not suffixes, so a
                # slice states the intent exactly.
                self.relevance_url = action[:-len('for')]
                self.relevance_token = parser.select(form, 'input[name=authenticity_token]', 1).attrib['value']

        self.score = int(parser.select(tree, 'div.figures figure.score', 1).text)
Example #5
0
def parse_date(s):
    """Translate French month names in *s* to English, then parse the date.

    The translated string is handed to ``_parse_dt`` and the result is
    converted with ``local2utc``.

    The full word u'Ao\\xfbt' is substituted before its prefix u'Ao\\xfb'.
    Doing it the other way round turns the full word into 'Augt' and the
    longer rule can never match.
    """
    replacements = (
        (u'Fév', 'Feb'),
        (u'Avr', 'Apr'),
        (u'Mai', 'May'),
        (u'Juin', 'Jun'),
        (u'Juil', 'Jul'),
        # Longest spelling first, see the docstring.
        (u'Ao\xfbt', 'Aug'),
        (u'Aoû', 'Aug'),
        (u'Déc', 'Dec'),
    )
    for french, english in replacements:
        s = s.replace(french, english)
    return local2utc(_parse_dt(s))
Example #6
0
def parse_date(s):
    """Translate French month names in *s* to English, then parse the date.

    The translated string is handed to ``_parse_dt`` and the result is
    converted with ``local2utc``.

    The full word u'Ao\\xfbt' is substituted before its prefix u'Ao\\xfb'.
    Doing it the other way round turns the full word into 'Augt' and the
    longer rule can never match.
    """
    replacements = (
        (u'Fév', 'Feb'),
        (u'Avr', 'Apr'),
        (u'Mai', 'May'),
        (u'Juin', 'Jun'),
        (u'Juil', 'Jul'),
        # Longest spelling first, see the docstring.
        (u'Ao\xfbt', 'Aug'),
        (u'Aoû', 'Aug'),
        (u'Déc', 'Dec'),
    )
    for french, english in replacements:
        s = s.replace(french, english)
    return local2utc(_parse_dt(s))
Example #7
0
def parse_dt(s):
    """Turn a date string into a datetime converted with ``local2utc``.

    Handled inputs:

    * ``None``: the current time is used.
    * a string containing ``'minutes ago'``: the first whitespace-separated
      token is read as the number of minutes to subtract from now.
    * a string containing an en dash, such as "Yesterday – 20:45": the left
      part selects the day and the right part, parsed with ``_parse_dt``,
      supplies hour and minute.
    * anything else is handed to ``_parse_dt`` unchanged.
    """
    now = datetime.datetime.now()
    if s is None:
        return local2utc(now)
    if 'minutes ago' in s:
        m = int(s.split()[0])
        d = now - datetime.timedelta(minutes=m)
    elif u'–' in s:
        # Date in form : "Yesterday – 20:45"
        day, hour = s.split(u'–')
        day = day.strip()
        hour = hour.strip()
        if day == 'Yesterday':
            d = now - datetime.timedelta(days=1)
        elif day == 'Today':
            d = now
        else:
            # Any other day label used to leave ``d`` unbound and crash
            # below with UnboundLocalError; parse it as a date instead.
            d = _parse_dt(day)
        hour = _parse_dt(hour)
        # Combine the selected day with the parsed hour and minute.
        d = datetime.datetime(d.year, d.month, d.day, hour.hour, hour.minute)
    else:
        # Date in form : "Dec 28, 2011"
        d = _parse_dt(s)
    return local2utc(d)
Example #8
0
def parse_dt(s):
    """Turn a date string into a datetime converted with ``local2utc``.

    Handled inputs:

    * ``None``: the current time is used.
    * a string containing ``"minutes ago"``: the first whitespace-separated
      token is read as the number of minutes to subtract from now.
    * a string containing an en dash, such as "Yesterday – 20:45": the left
      part selects the day and the right part, parsed with ``_parse_dt``,
      supplies hour and minute.
    * anything else is handed to ``_parse_dt`` unchanged.
    """
    now = datetime.datetime.now()
    if s is None:
        return local2utc(now)
    if "minutes ago" in s:
        m = int(s.split()[0])
        d = now - datetime.timedelta(minutes=m)
    elif u"–" in s:
        # Date in form : "Yesterday – 20:45"
        day, hour = s.split(u"–")
        day = day.strip()
        hour = hour.strip()
        if day == "Yesterday":
            d = now - datetime.timedelta(days=1)
        elif day == "Today":
            d = now
        else:
            # Any other day label used to leave ``d`` unbound and crash
            # below with UnboundLocalError; parse it as a date instead.
            d = _parse_dt(day)
        hour = _parse_dt(hour)
        # Combine the selected day with the parsed hour and minute.
        d = datetime.datetime(d.year, d.month, d.day, hour.hour, hour.minute)
    else:
        # Date in form : "Dec 28, 2011"
        d = _parse_dt(s)
    return local2utc(d)
Example #9
0
    def parse(self):
        """Fill this object's fields from the element stored in ``self.div``.

        Sets ``url``, ``title``, ``author``, ``username``, ``date``, ``body``
        and ``score``.  ``signature`` is only set when a ``p.signature``
        element is found in the content, and ``relevance_url`` /
        ``relevance_token`` only when the footer holds at least one
        ``form.button_to``.
        """
        parser = self.browser.parser

        self.url = '%s#%s' % (self.preurl, self.div.attrib['id'])
        self.title = unicode(
            parser.select(self.div.find('h2'), 'a.title', 1).text)

        # The first <p> child carries the author link, the date and the score.
        meta = self.div.find('p')
        try:
            a = parser.select(meta, 'a[rel=author]', 1)
        except BrokenPageError:
            # The author link could not be selected: use the anonymous defaults.
            self.author = 'Anonyme'
            self.username = None
        else:
            self.author = unicode(a.text)
            self.username = unicode(a.attrib['href'].split('/')[2])

        # Everything after '+' in the datetime attribute is dropped before
        # parsing; the resulting naive value is then passed to local2utc.
        stamp = parser.select(meta, 'time', 1).attrib['datetime'].split('+')[0]
        self.date = local2utc(datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S'))

        content = self.div.find('div')
        try:
            signature = parser.select(content, 'p.signature', 1)
        except BrokenPageError:
            # No signature.
            pass
        else:
            # Detach the signature so it does not end up in the body.
            content.remove(signature)
            self.signature = parser.tostring(signature)
        self.body = parser.tostring(content)

        self.score = int(parser.select(meta, 'span.score', 1).text)

        forms = parser.select(self.div.find('footer'), 'form.button_to')
        if forms:
            action = forms[0].attrib['action']
            # Remove the trailing vote word as a suffix.  str.rstrip() treats
            # its argument as a set of characters, so it could strip more
            # than the intended word.
            for suffix in ('for', 'against'):
                if action.endswith(suffix):
                    action = action[:-len(suffix)]
                    break
            self.relevance_url = action
            self.relevance_token = parser.select(
                forms[0], 'input[name=authenticity_token]', 1).attrib['value']
Example #10
0
    def __init__(self, browser, url, tree):
        """Initialise the content from ``url`` and, when given, its ``tree``.

        ``url`` and ``id`` (via ``url2id``) are always set.  When ``tree`` is
        ``None`` nothing else is read.  Otherwise ``title``, ``author``,
        ``username``, ``body`` and ``score`` are extracted from the tree;
        ``date`` is set unless ``BrokenPageError`` is raised while reading it,
        and ``relevance_url`` / ``relevance_token`` are set for each footer
        form whose action ends with ``/for`` (the last such form wins).
        """
        Content.__init__(self, browser)
        self.url = url
        self.id = url2id(self.url)

        if tree is None:
            return

        parser = self.browser.parser
        header = tree.find('header')
        self.title = u' — '.join(
            [a.text for a in header.find('h1').xpath('.//a')])
        try:
            a = parser.select(header, 'a[rel=author]', 1)
        except BrokenPageError:
            # The author link could not be selected: use the anonymous defaults.
            self.author = 'Anonyme'
            self.username = None
        else:
            self.author = unicode(a.text)
            self.username = unicode(a.attrib['href'].split('/')[2])
        self.body = parser.tostring(parser.select(tree, 'div.content', 1))
        try:
            # Everything after '+' in the datetime attribute is dropped
            # before parsing.
            stamp = parser.select(header, 'time',
                                  1).attrib['datetime'].split('+')[0]
            self.date = datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
            self.date = local2utc(self.date)
        except BrokenPageError:
            # Best effort: the date is simply left unset.
            pass
        for form in parser.select(tree.find('footer'), 'form.button_to'):
            action = form.attrib['action']
            if action.endswith('/for'):
                # Drop the trailing 'for' as a suffix and keep the slash.
                # str.rstrip() works on character sets, not suffixes, so a
                # slice states the intent exactly.
                self.relevance_url = action[:-len('for')]
                self.relevance_token = parser.select(
                    form, 'input[name=authenticity_token]', 1).attrib['value']

        self.score = int(
            parser.select(tree, 'div.figures figure.score', 1).text)
Example #11
0
def parse_dt(s):
    """Parse *s* with ``_parse_dt`` and convert the result with ``local2utc``."""
    return local2utc(_parse_dt(s))
Example #12
0
def parse_dt(s):
    """Parse *s* with ``_parse_dt`` and convert the result with ``local2utc``."""
    return local2utc(_parse_dt(s))