Beispiel #1
0
    def format_obj(self, obj, alias):
        """
        Render a message as text.

        The output starts with bold "Title", "Date" and "From" headers (plus
        "To" when the message exposes a non-empty ``receivers`` attribute),
        followed by the body and, when the message has one, its signature
        after a "-- " separator line. Body and signature go through
        html2text when the message carries the ``Message.IS_HTML`` flag.
        ``alias`` is not used.
        """
        text = u'%sTitle:%s %s\n' % (self.BOLD, self.NC, obj.title)
        stamp = obj.date.strftime('%Y-%m-%d %H:%M')
        text += u'%sDate:%s %s\n' % (self.BOLD, self.NC, stamp)
        text += u'%sFrom:%s %s\n' % (self.BOLD, self.NC, obj.sender)
        if hasattr(obj, 'receivers') and obj.receivers:
            recipients = ', '.join(obj.receivers)
            text += u'%sTo:%s %s\n' % (self.BOLD, self.NC, recipients)

        # Only messages flagged as HTML need converting.
        body = html2text(obj.content) if obj.flags & Message.IS_HTML else obj.content
        text += '\n%s' % body

        if not obj.signature:
            return text

        footer = html2text(obj.signature) if obj.flags & Message.IS_HTML else obj.signature
        return text + '\n-- \n%s' % footer
Beispiel #2
0
    def format_obj(self, obj, alias):
        """
        Render a message as text.

        The output starts with bold "Title", "Date" and "From" headers (plus
        "To" when the message exposes a non-empty ``receivers`` attribute),
        followed by the body and, when the message has one, its signature
        after a "-- " separator line. Body and signature go through
        html2text when the message carries the ``Message.IS_HTML`` flag.
        ``alias`` is not used.
        """
        text = u'%sTitle:%s %s\n' % (self.BOLD, self.NC, obj.title)
        stamp = obj.date.strftime('%Y-%m-%d %H:%M')
        text += u'%sDate:%s %s\n' % (self.BOLD, self.NC, stamp)
        text += u'%sFrom:%s %s\n' % (self.BOLD, self.NC, obj.sender)
        if hasattr(obj, 'receivers') and obj.receivers:
            recipients = ', '.join(obj.receivers)
            text += u'%sTo:%s %s\n' % (self.BOLD, self.NC, recipients)

        # Only messages flagged as HTML need converting.
        body = html2text(obj.content) if obj.flags & Message.IS_HTML else obj.content
        text += '\n%s' % body

        if not obj.signature:
            return text

        footer = html2text(obj.signature) if obj.flags & Message.IS_HTML else obj.signature
        return text + '\n-- \n%s' % footer
Beispiel #3
0
 def filter(self, el):
     """
     Extract the summary text from the first element of ``el``.

     The ``full-resume`` paragraph is looked up first; only when there is
     none is the ``small-resume`` paragraph looked up, cleaned with
     CleanText, converted with html2text and returned minus its first six
     characters. Every other path returns None.
     """
     node = el[0]
     if node.xpath("p[@data-rel='full-resume']"):
         # NOTE(review): a full resume that is found is never returned --
         # confirm whether this is intended.
         return None
     short = node.xpath("p[@data-rel='small-resume']")
     if not short:
         return None
     return html2text(CleanText(short[0])(self))[6:]
Beispiel #4
0
    def do_status(self, line):
        """
        status

        Display status information about a backend.
        """
        # An empty command line means no backend restriction is passed on.
        backend_name = line if len(line) > 0 else None

        # Group the returned status fields per backend.
        results = {}
        for field in self.do('get_account_status',
                             backends=backend_name,
                             caps=CapAccount):
            results.setdefault(field.backend, []).append(field)

        # items() works on both Python 2 and 3; iteritems() is Python 2-only.
        for name, fields in results.items():
            print(':: %s ::' % name)
            for f in fields:
                # Values flagged as HTML are converted before printing.
                value = html2text(f.value) if f.flags & f.FIELD_HTML else f.value
                print('%s: %s' % (f.label, value))
            print('')
Beispiel #5
0
 def read_renew(self, id):
     """
     Return the renewal status of the entry whose input value is ``id``.

     Scans the ``patFuncEntry`` table rows. For the first row containing an
     input with that value, the ``patFuncStatus`` cell is serialized,
     converted with html2text, stripped of newlines and stored as the
     message of a new ``Renew``. Returns None when no row matches.
     """
     rows = self.document.getroot().xpath('//tr[@class="patFuncEntry"]')
     for row in rows:
         if not row.xpath('td/input[@value="%s"]' % id):
             continue
         status_cell = row.xpath('td[@class="patFuncStatus"]')[0]
         status_html = self.browser.parser.tostring(status_cell)
         renew = Renew(id)
         renew.message = html2text(status_html).replace('\n', '')
         return renew
Beispiel #6
0
    def fill_gallery(self, gallery):
        """
        Populate ``gallery`` from the page held in ``self.document``.

        Fills in title, original title (None when the page has none),
        description, cardinality, date, rating (None when the rating label
        contains no decimal number), rating_max and thumbnail.
        """
        page = self.document

        gallery.title = page.xpath("//h1[@id='gn']/text()")[0]

        # The second heading is optional.
        alt_titles = page.xpath("//h1[@id='gj']/text()")
        gallery.original_title = alt_titles[0] if alt_titles else None

        description_node = page.xpath("//div[@id='gd71']")[0]
        gallery.description = html2text(self.parser.tostring(description_node))

        image_count = page.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Images:']/td[@class='gdt2']/text()")[0]
        gallery.cardinality = int(re.match(r"\d+", image_count).group(0))

        posted = page.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Posted:']/td[@class='gdt2']/text()")[0]
        gallery.date = datetime.strptime(posted, "%Y-%m-%d %H:%M")

        rating_label = page.xpath("//td[@id='rating_label']/text()")[0]
        rating_match = re.search(r"\d+\.\d+", rating_label)
        if rating_match is None:
            gallery.rating = None
        else:
            gallery.rating = float(rating_match.group(0))
        gallery.rating_max = 5

        # The thumbnail is either an <img> source or a CSS background URL.
        image_sources = page.xpath("//div[@class='gdtm']/a/img/attribute::src")
        if image_sources:
            thumbnail_url = image_sources[0]
        else:
            css = page.xpath("//div[@class='gdtm']/div/attribute::style")[0]
            thumbnail_url = re.search(r"background:[^;]+url\((.+?)\)", css).group(1)

        gallery.thumbnail = BaseImage(thumbnail_url)
        gallery.thumbnail.url = gallery.thumbnail.id
Beispiel #7
0
    def fill_gallery(self, gallery):
        """
        Populate ``gallery`` from the page held in ``self.document``.

        Fills in title, original title (None when the page has none),
        description, cardinality, date, rating (None when the rating label
        contains no decimal number), rating_max and thumbnail.
        """
        page = self.document

        gallery.title = page.xpath("//h1[@id='gn']/text()")[0]

        # The second heading is optional.
        alt_titles = page.xpath("//h1[@id='gj']/text()")
        gallery.original_title = alt_titles[0] if alt_titles else None

        description_node = page.xpath("//div[@id='gd71']")[0]
        gallery.description = html2text(self.parser.tostring(description_node))

        image_count = page.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Images:']/td[@class='gdt2']/text()")[0]
        gallery.cardinality = int(re.match(r"\d+", image_count).group(0))

        posted = page.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Posted:']/td[@class='gdt2']/text()")[0]
        gallery.date = datetime.strptime(posted, "%Y-%m-%d %H:%M")

        rating_label = page.xpath("//td[@id='rating_label']/text()")[0]
        found = re.search(r"\d+\.\d+", rating_label)
        gallery.rating = float(found.group(0)) if found is not None else None
        gallery.rating_max = 5

        # The thumbnail is either an <img> source or a CSS background URL.
        image_sources = page.xpath("//div[@class='gdtm']/a/img/attribute::src")
        if image_sources:
            thumbnail_url = image_sources[0]
        else:
            css = page.xpath("//div[@class='gdtm']/div/attribute::style")[0]
            thumbnail_url = re.search(r"background:[^;]+url\((.+?)\)", css).group(1)

        gallery.thumbnail = BaseImage(thumbnail_url)
        gallery.thumbnail.url = gallery.thumbnail.id
Beispiel #8
0
    def set_video_metadata(self, video):
        """
        Fill ``video`` from the <meta> tags of the page <head>.

        Sets title, author, thumbnail, duration and description. Duration
        becomes ``NotAvailable`` when selecting the ``video:duration`` tag
        raises BrokenPageError; description becomes an empty string when
        selecting the ``og:description`` tag raises BrokenPageError.

        Raises BrokenPageError when the duration has more than three
        colon-separated parts.
        """
        head = self.parser.select(self.document.getroot(), 'head', 1)

        video.title = unicode(self.parser.select(head, 'meta[property="og:title"]', 1).get("content")).strip()
        video.author = unicode(self.parser.select(head, 'meta[name="author"]', 1).get("content")).strip()

        url = unicode(self.parser.select(head, 'meta[property="og:image"]', 1).get("content")).strip()
        # remove the useless anti-caching
        # Raw string: "\?" and "\d" are regex escapes, not string escapes.
        url = re.sub(r'\?\d+', '', url)
        video.thumbnail = BaseImage(url)
        video.thumbnail.url = video.thumbnail.id

        try:
            parts = self.parser.select(head, 'meta[property="video:duration"]', 1).get("content").strip().split(':')
        except BrokenPageError:
            # it's probably a live, np.
            video.duration = NotAvailable
        else:
            # Accept "S", "M:S" and "H:M:S".
            if len(parts) == 1:
                seconds = parts[0]
                hours = minutes = 0
            elif len(parts) == 2:
                minutes, seconds = parts
                hours = 0
            elif len(parts) == 3:
                hours, minutes, seconds = parts
            else:
                raise BrokenPageError('Unable to parse duration %r' % parts)
            video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))

        try:
            video.description = html2text(self.parser.select(head, 'meta[property="og:description"]', 1).get("content")).strip() or unicode()
        except BrokenPageError:
            video.description = u''
Beispiel #9
0
    def get_job_advert(self, url, advert):
        """
        Fill (or create) a job advert from the current offer page.

        :param url: URL of the offer; its captured groups provide the advert
            id and an initial contract type.
        :param advert: advert to complete, or None to create a new
            ``AdeccoJobAdvert`` whose id is built from ``url``.
        :returns: the completed advert.

        Raises AttributeError when ``url`` does not match the offer URL
        pattern (the match object is None).
        """
        # Raw strings keep the regex escapes (\?, \d, \s) out of Python's
        # string-escape processing.
        re_id = re.compile(
            r"http://www.adecco.fr/trouver-un-emploi/Pages/Details-de-l-Offre/(.*?)/(.*?).aspx\?IOF=(.*?)$", re.DOTALL
        )
        # Search once and reuse the match for every group.
        url_match = re_id.search(url)
        if advert is None:
            _id = u"%s/%s/%s" % (url_match.group(1), url_match.group(2), url_match.group(3))
            advert = AdeccoJobAdvert(_id)

        advert.contract_type = url_match.group(1)
        div = self.document.getroot().xpath("//div[@class='contain_MoreResults']")[0]

        date = u"%s" % self.parser.select(div, "div[@class='dateResult']", 1, method="xpath").text.strip()
        m = re.match(r"(\d{2})\s(.*?)\s(\d{4})", date)
        if m:
            dd = int(m.group(1))
            # MONTHS is indexed from 0, calendar months from 1.
            mm = MONTHS.index(m.group(2)) + 1
            yyyy = int(m.group(3))
            advert.publication_date = datetime.date(yyyy, mm, dd)

        # The <h1> also contains the town and page title; strip both out.
        title = self.parser.select(div, "h1", 1, method="xpath").text_content().strip()
        town = self.parser.select(div, "h1/span/span[@class='town']", 1, method="xpath").text_content()
        page_title = self.parser.select(div, "h1/span[@class='pageTitle']", 1, method="xpath").text_content()
        advert.title = u"%s" % title.replace(town, "").replace(page_title, "")

        spans = self.document.getroot().xpath("//div[@class='jobGreyContain']/table/tr/td/span[@class='value']")
        advert.job_name = u"%s" % spans[0].text
        advert.place = u"%s" % spans[1].text
        advert.pay = u"%s" % spans[2].text
        # Overrides the contract type taken from the URL above.
        advert.contract_type = u"%s" % spans[3].text
        advert.url = url
        description = self.document.getroot().xpath("//div[@class='descriptionContainer']/p")[0]
        advert.description = html2text(self.parser.tostring(description))
        return advert
Beispiel #10
0
 def test_lefigaro(self):
     # Point the backend at the configured feed, then check that threads are
     # listed and that the first one has non-empty content, both raw and
     # after html2text conversion.
     feed_name = self.backend.config['feed'].get()
     self.backend.RSS_FEED = "http://www.lefigaro.fr/rss/figaro_%s.xml" % feed_name
     threads = list(self.backend.iter_threads())
     assert len(threads)
     first = self.backend.get_thread(threads[0].id)
     assert len(first.root.content)
     assert len(html2text(first.root.content))
Beispiel #11
0
 def format_obj(self, obj, alias):
     """
     Render an issue as coloured text.

     The output is a header line (project name, full id, title), the body,
     the author line, the lines produced by ``format_attr`` for the
     standard attributes, then -- when present -- custom fields,
     attachments and history. Each history entry lists its changes and,
     when it has a message, that message converted with html2text and
     indented. ``alias`` is not used.
     """
     result = u'%s %s %s %s %s\n' % (self.colored(obj.project.name, 'blue', 'bold'),
                                     self.colored(u'—', 'cyan', 'bold'),
                                     self.colored(obj.fullid, 'red', 'bold'),
                                     self.colored(u'—', 'cyan', 'bold'),
                                     self.colored(obj.title, 'yellow', 'bold'))
     result += '\n%s\n\n' % obj.body
     result += self.format_key('Author', '%s (%s)' % (obj.author.name, obj.creation))
     for attribute in ('status', 'priority', 'version',
                       'tracker', 'category', 'assignee'):
         result += self.format_attr(obj, attribute)
     if hasattr(obj, 'fields') and not empty(obj.fields):
         # items() works on both Python 2 and 3; iteritems() is Python 2-only.
         for key, value in obj.fields.items():
             result += self.format_key(key.capitalize(), value)
     if hasattr(obj, 'attachments') and obj.attachments:
         result += '\n%s\n' % self.colored('Attachments:', 'green')
         for a in obj.attachments:
             result += '* %s%s%s <%s>\n' % (self.BOLD, a.filename, self.NC, a.url)
     if hasattr(obj, 'history') and obj.history:
         result += '\n%s\n' % self.colored('History:', 'green')
         for u in obj.history:
             result += '%s %s %s %s\n' % (self.colored('*', 'red', 'bold'),
                                          self.colored(u.date, 'yellow', 'bold'),
                                          self.colored(u'—', 'cyan', 'bold'),
                                          self.colored(u.author.name, 'blue', 'bold'))
             for change in u.changes:
                 result += '  - %s %s %s %s\n' % (self.colored(change.field, 'green'),
                                                  change.last,
                                                  self.colored('->', 'magenta'), change.new)
             if u.message:
                 # Indent continuation lines to line up with the first one.
                 result += '    %s\n' % html2text(u.message).strip().replace('\n', '\n    ')
     return result
Beispiel #12
0
    def get_thread_mails(self):
        """
        Collect the messages of the displayed thread.

        Returns a dict with two keys: ``member`` (holding the correspondent's
        ``pseudo``, or 'Unknown' when the heading lookup raises IndexError)
        and ``messages``, a list of dicts with ``date``, ``message`` and
        ``id_from``. List items are walked in reverse document order and
        items without a message body are skipped.
        """
        try:
            name_nodes = self.document.getroot().cssselect('div#message_heading div.username span.name')
            pseudo = self.parser.tocleanstring(name_nodes[0])
        except IndexError:
            pseudo = 'Unknown'

        messages = []
        items = self.document.xpath('//ul[@id="thread"]//li[contains(@id, "message_")]')
        for item in reversed(items):
            try:
                body_html = self.parser.tostring(item.xpath('.//div[@class="message_body"]')[0])
            except IndexError:
                # 'Match' message
                continue
            body = html2text(body_html).strip()

            # The timestamp is embedded in an inline script.
            script_text = item.xpath('.//span[@class="timestamp"]//script')[0].text
            m = re.search(r'(\d+), ', script_text)
            assert m
            sent = local2utc(datetime.fromtimestamp(int(m.group(1))))

            # Sender id: last path segment of the first link, query dropped.
            href = item.find('a').attrib['href']
            sender = href.split('/')[-1].split('?')[0]

            messages.append({
                'date' : sent,
                'message' : unicode(body),
                'id_from' : unicode(sender),
            })

        return {
            'member' : {'pseudo': pseudo},
            'messages' : messages,
        }
Beispiel #13
0
    def do_status(self, line):
        """
        status

        Display status information about a backend.
        """
        # An empty command line means no backend restriction is passed on.
        backend_name = line if len(line) > 0 else None

        # Group the returned status fields per backend.
        results = {}
        for field in self.do('get_account_status',
                             backends=backend_name,
                             caps=CapAccount):
            results.setdefault(field.backend, []).append(field)

        # items() works on both Python 2 and 3; iteritems() is Python 2-only.
        for name, fields in results.items():
            print(':: %s ::' % name)
            for f in fields:
                # Values flagged as HTML are converted before printing.
                value = html2text(f.value) if f.flags & f.FIELD_HTML else f.value
                print('%s: %s' % (f.label, value))
            print('')
Beispiel #14
0
 def test_lefigaro(self):
     # Point the backend at the configured feed, then check that threads are
     # listed and that the first one has non-empty content, both raw and
     # after html2text conversion.
     feed_name = self.backend.config['feed'].get()
     self.backend.RSS_FEED = "http://www.lefigaro.fr/rss/figaro_%s.xml" % feed_name
     threads = list(self.backend.iter_threads())
     assert len(threads)
     first = self.backend.get_thread(threads[0].id)
     assert len(first.root.content)
     assert len(html2text(first.root.content))
Beispiel #15
0
 def read_renew(self, id):
     """
     Return the renewal status of the entry whose input value is ``id``.

     Scans the ``patFuncEntry`` table rows. For the first row containing an
     input with that value, the ``patFuncStatus`` cell is serialized,
     converted with html2text, stripped of newlines and stored as the
     message of a new ``Renew``. Returns None when no row matches.
     """
     rows = self.document.getroot().xpath('//tr[@class="patFuncEntry"]')
     for row in rows:
         if not row.xpath('td/input[@value="%s"]' % id):
             continue
         status_cell = row.xpath('td[@class="patFuncStatus"]')[0]
         status_html = self.browser.parser.tostring(status_cell)
         renew = Renew(id)
         renew.message = html2text(status_html).replace('\n', '')
         return renew
Beispiel #16
0
 def on_load(self):
     """
     Raise TransferBankError when the page carries an input-error script.

     The error text is the first argument of the ``.html("...")`` call
     found in that script, converted with html2text and cleaned with
     CleanText. Nothing happens when no such script is found.
     """
     # This aims to track input errors.
     script_error = CleanText(
         u"//script[contains(text(), 'if (\"true\"===\"true\")')]")(
             self.doc)
     if script_error:
         # Raw string so that "\." and "\(" reach the regex engine untouched.
         message = re.search(r'\.html\("(.*?)"\)', script_error).group(1)
         raise TransferBankError(CleanText().filter(html2text(message)))
Beispiel #17
0
    def fill_special_advert(self, advert, div):
        """
        Fill ``advert`` from ``div`` for the "poste" page layout.

        Sets title and description from ``div`` and, when the left-hand menu
        has an ``employmentType`` span, the contract type. The remaining
        fields are delegated to ``fill_advert``, whose result is returned.
        """
        title_node = self.parser.select(div, 'div[@class="poste"]', 1, method='xpath')
        advert.title = u'%s' % title_node.text

        body_node = self.parser.select(div, 'div[@id="jobBodyContent"]', 1, method='xpath')
        advert.description = html2text(self.parser.tostring(body_node))

        left_menu = self.document.getroot().xpath('//div[@id="divmenuGauche"]')[0]
        employment = self.parser.select(left_menu, '//span[@itemprop="employmentType"]', method='xpath')
        if employment:
            advert.contract_type = u'%s' % employment[0].text_content()

        return self.fill_advert(advert, left_menu)
Beispiel #18
0
    def get_biography(self):
        """
        Return the biography text found in ``div#bio_content``.

        Children of that div are ignored until one carries the attribute
        ``name="mini_bio"``; that child and every following one are
        serialized, converted with html2text and concatenated.
        """
        container = self.parser.select(self.document.getroot(), 'div#bio_content', 1)
        pieces = []
        collecting = False
        for child in container.getchildren():
            # Once the marker element is seen, everything after it is kept.
            collecting = collecting or child.attrib.get('name') == 'mini_bio'
            if collecting:
                pieces.append(html2text(self.parser.tostring(child)))

        return unicode().join(pieces)
Beispiel #19
0
    def get_biography(self):
        """
        Return the biography text found in ``div#bio_content``.

        Children of that div are ignored until one carries the attribute
        ``name="mini_bio"``; that child and every following one are
        serialized, converted with html2text and concatenated.
        """
        container = self.parser.select(self.document.getroot(), 'div#bio_content', 1)
        pieces = []
        collecting = False
        for child in container.getchildren():
            # Once the marker element is seen, everything after it is kept.
            collecting = collecting or child.attrib.get('name') == 'mini_bio'
            if collecting:
                pieces.append(html2text(self.parser.tostring(child)))

        return unicode().join(pieces)
Beispiel #20
0
    def get_video(self, video=None):
        """
        Fill in the description of ``video`` and locate its JSON descriptor.

        When ``video`` is falsy a new ``ArteLiveVideo`` is created from the
        ``id`` entry of ``self.group_dict``. Returns a ``(json_url, video)``
        tuple, where ``json_url`` is the ``arte_vp_url`` attribute of the
        video container div.
        """
        if not video:
            video = ArteLiveVideo(self.group_dict['id'])

        presentation = self.document.xpath('//div[@class="bloc-presentation"]')[0]
        body_xpath = 'div[@class="field field-name-body field-type-text-with-summary field-label-hidden bloc-rte"]'
        body = self.parser.select(presentation, body_xpath, 1, method='xpath')
        video.description = html2text(self.parser.tostring(body))

        container = self.document.xpath('//div[@class="video-container"]')[0]
        return container.attrib['arte_vp_url'], video
Beispiel #21
0
    def fill_normal_advert(self, advert, div):
        """
        Fill ``advert`` from ``div`` for the standard page layout.

        Sets title and description from ``div`` and, when the job summary
        provides them, the contract type and the society name. The remaining
        fields are delegated to ``fill_advert``, whose result is returned.
        """
        heading = self.parser.select(div, 'h1', 1, method='xpath')
        advert.title = u'%s' % heading.text

        body_node = self.parser.select(div, 'div[@id="jobBodyContent"]', 1, method='xpath')
        advert.description = html2text(self.parser.tostring(body_node))

        summary = self.document.getroot().xpath('//div[@id="jobsummary_content"]')[0]

        contracts = self.parser.select(summary, 'dl/dd[@class="multipleddlast"]/span', method='xpath')
        if contracts:
            advert.contract_type = u'%s' % contracts[0].text_content()

        companies = self.parser.select(summary, '//span[@itemprop="name"]', method='xpath')
        if companies:
            advert.society_name = u'%s' % companies[0].text_content()

        return self.fill_advert(advert, summary)
Beispiel #22
0
    def get_video(self, video=None):
        """
        Fill in the description of ``video`` and locate its JSON descriptor.

        When ``video`` is falsy a new ``ArteLiveVideo`` is created whose id
        is the ``id`` entry of ``self.group_dict`` prefixed with a slash.
        Returns a ``(json_url, video)`` tuple, where ``json_url`` is the
        ``arte_vp_url`` attribute of the video container div.
        """
        if not video:
            video = ArteLiveVideo('/%s' % self.group_dict['id'])

        presentation = self.document.xpath('//div[@class="bloc-presentation"]')[0]
        body_xpath = 'div[@class="field field-name-body field-type-text-with-summary field-label-hidden bloc-rte"]'
        body = self.parser.select(presentation, body_xpath, 1, method='xpath')
        video.description = html2text(self.parser.tostring(body))

        container = self.document.xpath('//div[@class="video-container"]')[0]
        return container.attrib['arte_vp_url'], video
Beispiel #23
0
    def get_job_advert(self, url, advert):
        """
        Fill (or create) a job advert from the ``offre-body`` div.

        :param url: stored as-is on the advert.
        :param advert: advert to complete; when falsy a new
            ``PopolemploiJobAdvert`` is created with the id read from the
            page.
        :returns: the completed advert.
        """
        content = self.document.getroot().xpath('//div[@id="offre-body"]')[0]

        def single(path):
            # The one element matching ``path`` below the offer body.
            return self.parser.select(content, path, 1, method='xpath')

        if not advert:
            _id = single('div/div/ul/li/div[@class="value"]/span').text
            advert = PopolemploiJobAdvert(_id)

        # Title and job name both come from the <h4> heading.
        advert.title = u'%s' % single('h4').text.strip()
        advert.job_name = u'%s' % single('h4').text.strip()

        description = single('p[@itemprop="description"]')
        advert.description = html2text(self.parser.tostring(description))

        companies = self.parser.select(content, 'div[@class="vcard"]/p[@class="title"]/span', method='xpath')
        if companies:
            advert.society_name = u'%s' % companies[0].text

        advert.url = url

        # Remaining fields share one recipe: format the node text, then strip.
        simple_fields = (
            ('place', 'dl/dd/ul/li[@itemprop="addressRegion"]'),
            ('contract_type', 'dl/dd/span[@itemprop="employmentType"]'),
            ('experience', 'dl/dd/span[@itemprop="experienceRequirements"]'),
            ('formation', 'dl/dd/span[@itemprop="qualifications"]'),
            ('pay', 'dl/dd/span[@itemprop="baseSalary"]'),
        )
        for attribute, path in simple_fields:
            value = u'%s' % single(path).text
            setattr(advert, attribute, value.strip())

        return advert
Beispiel #24
0
    def set_video_metadata(self, video):
        """
        Fill ``video`` from the <meta> tags of the page <head>.

        Sets title, author, thumbnail, duration and description. Duration
        becomes ``NotAvailable`` when selecting the ``video:duration`` tag
        raises BrokenPageError; description becomes an empty string when
        selecting the ``og:description`` tag raises BrokenPageError.

        Raises BrokenPageError when the duration has more than three
        colon-separated parts.
        """
        head = self.parser.select(self.document.getroot(), 'head', 1)

        video.title = unicode(
            self.parser.select(head, 'meta[property="og:title"]',
                               1).get("content")).strip()
        video.author = unicode(
            self.parser.select(head, 'meta[name="author"]',
                               1).get("content")).strip()

        url = unicode(
            self.parser.select(head, 'meta[property="og:image"]',
                               1).get("content")).strip()
        # remove the useless anti-caching
        # Raw string: "\?" and "\d" are regex escapes, not string escapes.
        url = re.sub(r'\?\d+', '', url)
        video.thumbnail = BaseImage(url)
        video.thumbnail.url = video.thumbnail.id

        try:
            parts = self.parser.select(head, 'meta[property="video:duration"]',
                                       1).get("content").strip().split(':')
        except BrokenPageError:
            # it's probably a live, np.
            video.duration = NotAvailable
        else:
            # Accept "S", "M:S" and "H:M:S".
            if len(parts) == 1:
                seconds = parts[0]
                hours = minutes = 0
            elif len(parts) == 2:
                minutes, seconds = parts
                hours = 0
            elif len(parts) == 3:
                hours, minutes, seconds = parts
            else:
                raise BrokenPageError('Unable to parse duration %r' % parts)
            video.duration = datetime.timedelta(hours=int(hours),
                                                minutes=int(minutes),
                                                seconds=int(seconds))

        try:
            video.description = html2text(
                self.parser.select(head, 'meta[property="og:description"]',
                                   1).get("content")).strip() or unicode()
        except BrokenPageError:
            video.description = u''
Beispiel #25
0
    def get_job_advert(self, url, advert):
        """
        Fill (or create) a job advert from the current offer page.

        :param url: URL of the offer; its captured groups provide the advert
            id and an initial contract type.
        :param advert: advert to complete, or None to create a new
            ``AdeccoJobAdvert`` whose id is built from ``url``.
        :returns: the completed advert.

        Raises AttributeError when ``url`` does not match the offer URL
        pattern (the match object is None).
        """
        # Raw strings keep the regex escapes (\?, \d, \s) out of Python's
        # string-escape processing.
        re_id = re.compile(
            r'http://www.adecco.fr/trouver-un-emploi/Pages/Details-de-l-Offre/(.*?)/(.*?).aspx\?IOF=(.*?)$',
            re.DOTALL)
        # Search once and reuse the match for every group.
        url_match = re_id.search(url)
        if advert is None:
            _id = u'%s/%s/%s' % (url_match.group(1),
                                 url_match.group(2),
                                 url_match.group(3))
            advert = AdeccoJobAdvert(_id)

        advert.contract_type = url_match.group(1)
        div = self.document.getroot().xpath(
            "//div[@class='contain_MoreResults']")[0]

        date = u'%s' % self.parser.select(
            div, "div[@class='dateResult']", 1, method='xpath').text.strip()
        m = re.match(r'(\d{2})\s(.*?)\s(\d{4})', date)
        if m:
            dd = int(m.group(1))
            # MONTHS is indexed from 0, calendar months from 1.
            mm = MONTHS.index(m.group(2)) + 1
            yyyy = int(m.group(3))
            advert.publication_date = datetime.date(yyyy, mm, dd)

        # The <h1> also contains the town and page title; strip both out.
        title = self.parser.select(div, "h1", 1,
                                   method='xpath').text_content().strip()
        town = self.parser.select(div,
                                  "h1/span/span[@class='town']",
                                  1,
                                  method='xpath').text_content()
        page_title = self.parser.select(div,
                                        "h1/span[@class='pageTitle']",
                                        1,
                                        method='xpath').text_content()
        advert.title = u'%s' % title.replace(town, '').replace(page_title, '')

        spans = self.document.getroot().xpath(
            "//div[@class='jobGreyContain']/table/tr/td/span[@class='value']")
        advert.job_name = u'%s' % spans[0].text
        advert.place = u'%s' % spans[1].text
        advert.pay = u'%s' % spans[2].text
        # Overrides the contract type taken from the URL above.
        advert.contract_type = u'%s' % spans[3].text
        advert.url = url
        description = self.document.getroot().xpath(
            "//div[@class='descriptionContainer']/p")[0]
        advert.description = html2text(self.parser.tostring(description))
        return advert
Beispiel #26
0
    def get_job_advert(self, url, advert):
        """
        Fill (or create) a job advert from the current offer page.

        :param url: URL of the offer; when ``advert`` is None its captured
            groups provide the advert id and title.
        :param advert: advert to complete, or None to create a new
            ``ApecJobAdvert``.
        :returns: the completed advert.

        Raises AttributeError when ``advert`` is None and ``url`` does not
        match the offer URL pattern (the match object is None).
        """
        # Raw string keeps "\d" out of Python's string-escape processing.
        re_id_title = re.compile(
            r'/offres-emploi-cadres/\d*_\d*_\d*_(.*?)________(.*?).html(.*?)',
            re.DOTALL)
        if advert is None:
            # Search once and reuse the match for every group.
            url_match = re_id_title.search(url)
            _id = u'%s/%s' % (url_match.group(1), url_match.group(2))
            advert = ApecJobAdvert(_id)
            advert.title = url_match.group(2).replace('-', ' ')

        description = self.document.getroot().xpath(
            "//div[@class='contentWithDashedBorderTop marginTop boxContent']/div"
        )[0]
        advert.description = html2text(self.parser.tostring(description))

        advert.job_name = advert.title

        trs = self.document.getroot().xpath(
            "//table[@class='noFieldsTable']/tr")
        for tr in trs:
            th = self.parser.select(tr, 'th', 1, method='xpath')
            td = self.parser.select(tr, 'td', 1, method='xpath')
            # Header text decides which advert field the row fills.
            label = u'%s' % th.text_content()
            if u'Date de publication' in label:
                advert.publication_date = dateutil.parser.parse(
                    td.text_content()).date()
            elif u'Société' in label and not advert.society_name:
                society_name = td.text_content()
                a = self.parser.select(td, 'a', method='xpath')
                if a:
                    # Drop the link text embedded in the cell.
                    advert.society_name = u'%s' % society_name.replace(
                        a[0].text_content(), '').strip()
                else:
                    advert.society_name = society_name.strip()
            elif u'Type de contrat' in label:
                advert.contract_type = u'%s' % td.text_content().strip()
            elif u'Lieu' in label:
                advert.place = u'%s' % td.text_content()
            elif u'Salaire' in label:
                advert.pay = u'%s' % td.text_content()
            elif u'Expérience' in label:
                advert.experience = u'%s' % td.text_content()

        advert.url = url
        return advert
Beispiel #27
0
    def format_obj(self, obj, alias):
        """
        Render a price entry as text.

        The bold headline is the entry's message or, failing that, the shop
        name and location. It is followed by id, product, cost and
        (optional) date lines, then a "Shop" section with name, optional
        location and optional info converted with html2text. ``alias`` is
        not used.
        """
        if hasattr(obj, 'message') and obj.message:
            headline = obj.message
        else:
            headline = u'%s (%s)' % (obj.shop.name, obj.shop.location)

        parts = [
            u'%s%s%s\n' % (self.BOLD, headline, self.NC),
            u'ID: %s\n' % obj.fullid,
            u'Product: %s\n' % obj.product.name,
            u'Cost: %s%s\n' % (obj.cost, obj.currency),
        ]
        if hasattr(obj, 'date') and obj.date:
            parts.append(u'Date: %s\n' % obj.date.strftime('%Y-%m-%d'))

        parts.append(u'\n%sShop:%s\n' % (self.BOLD, self.NC))
        parts.append(u'\tName: %s\n' % obj.shop.name)
        if obj.shop.location:
            parts.append(u'\tLocation: %s\n' % obj.shop.location)
        if obj.shop.info:
            # Keep every line of the info block tab-indented.
            info = html2text(obj.shop.info).replace('\n', '\n\t').strip()
            parts.append(u'\n\t' + info)

        return u''.join(parts)
Beispiel #28
0
    def format_obj(self, obj, alias):
        """
        Render a price entry as text.

        The bold headline is the entry's message or, failing that, the shop
        name and location. It is followed by id, product, cost and
        (optional) date lines, then a "Shop" section with name, optional
        location and optional info converted with html2text. ``alias`` is
        not used.
        """
        if hasattr(obj, 'message') and obj.message:
            headline = obj.message
        else:
            headline = u'%s (%s)' % (obj.shop.name, obj.shop.location)

        parts = [
            u'%s%s%s\n' % (self.BOLD, headline, self.NC),
            u'ID: %s\n' % obj.fullid,
            u'Product: %s\n' % obj.product.name,
            u'Cost: %s%s\n' % (obj.cost, obj.currency),
        ]
        if hasattr(obj, 'date') and obj.date:
            parts.append(u'Date: %s\n' % obj.date.strftime('%Y-%m-%d'))

        parts.append(u'\n%sShop:%s\n' % (self.BOLD, self.NC))
        parts.append(u'\tName: %s\n' % obj.shop.name)
        if obj.shop.location:
            parts.append(u'\tLocation: %s\n' % obj.shop.location)
        if obj.shop.info:
            # Keep every line of the info block tab-indented.
            info = html2text(obj.shop.info).replace('\n', '\n\t').strip()
            parts.append(u'\n\t' + info)

        return u''.join(parts)
Beispiel #29
0
 def format_obj(self, obj, alias):
     """
     Render an issue as coloured text.

     The output is a header line (project name, full id, title), the body,
     the author line, the lines produced by ``format_attr`` for the
     standard attributes, then -- when present -- custom fields,
     attachments and history. Each history entry lists its changes and,
     when it has a message, that message converted with html2text and
     indented. ``alias`` is not used.
     """
     project = self.colored(obj.project.name, 'blue', 'bold')
     dash = self.colored(u'—', 'cyan', 'bold')
     issue_id = self.colored(obj.fullid, 'red', 'bold')
     title = self.colored(obj.title, 'yellow', 'bold')

     out = u'%s %s %s %s %s\n' % (project, dash, issue_id, dash, title)
     out += '\n%s\n\n' % obj.body
     author = '%s (%s)' % (obj.author.name, obj.creation)
     out += self.format_key('Author', author)

     for attribute in ('status', 'priority', 'version',
                       'tracker', 'category', 'assignee'):
         out += self.format_attr(obj, attribute)

     if hasattr(obj, 'fields') and not empty(obj.fields):
         for name, value in obj.fields.items():
             out += self.format_key(name.capitalize(), value)

     if hasattr(obj, 'attachments') and obj.attachments:
         out += '\n%s\n' % self.colored('Attachments:', 'green')
         for attachment in obj.attachments:
             out += '* %s%s%s <%s>\n' % (self.BOLD, attachment.filename,
                                         self.NC, attachment.url)

     if hasattr(obj, 'history') and obj.history:
         out += '\n%s\n' % self.colored('History:', 'green')
         for update in obj.history:
             bullet = self.colored('*', 'red', 'bold')
             when = self.colored(update.date, 'yellow', 'bold')
             who = self.colored(update.author.name, 'blue', 'bold')
             out += '%s %s %s %s\n' % (bullet, when, dash, who)
             for change in update.changes:
                 field = self.colored(change.field, 'green')
                 arrow = self.colored('->', 'magenta')
                 out += '  - %s %s %s %s\n' % (field, change.last,
                                               arrow, change.new)
             if update.message:
                 # Indent continuation lines to line up with the first one.
                 note = html2text(update.message).strip()
                 out += '    %s\n' % note.replace('\n', '\n    ')
     return out
Beispiel #30
0
    def get_thread_mails(self):
        """Collect the messages of the currently loaded thread page.

        Returns a dict with two keys: ``member``, a dict holding the
        ``pseudo`` read from the message heading (``'Unknown'`` when the
        heading element is missing), and ``messages``, a list of dicts with
        ``date``, ``message`` and ``id_from`` keys.  Thread items are visited
        in reverse document order; items without a message body are skipped.
        """
        try:
            heading = self.document.getroot().cssselect(
                '#message_heading div.username span.name')
            pseudo = self.parser.tocleanstring(heading[0])
        except IndexError:
            pseudo = 'Unknown'

        messages = []
        items = self.document.xpath(
            '//ul[@id="thread"]//li[contains(@id, "message_")]')
        for item in reversed(items):
            try:
                body_node = item.xpath('.//div[@class="message_body"]')[0]
                raw = self.parser.tostring(body_node)
            except IndexError:
                continue  # 'Match' message
            text = html2text(raw).strip()

            # The timestamp is the first "<digits>, " run found in the
            # script nested in the timestamp span.
            script = item.xpath('.//span[@class="timestamp"]//script')[0]
            found = re.search(r'(\d+), ', script.text)
            assert found
            when = local2utc(datetime.fromtimestamp(int(found.group(1))))

            # Sender id: last path segment of the first link, query removed.
            href = item.find('a').attrib['href']
            sender_id = href.split('/')[-1].split('?')[0]

            messages.append({
                'date': when,
                'message': unicode(text),
                'id_from': unicode(sender_id),
            })

        return {
            'member': {'pseudo': pseudo},
            'messages': messages,
        }
Beispiel #31
0
    def get_job_advert(self, url, advert):
        """Fill (or create) a job advert from the currently loaded page.

        :param url: URL of the advert page.  When ``advert`` is None, the
            advert id and title are extracted from it.
        :param advert: advert object to complete, or None to create a new
            ``ApecJobAdvert``.
        :returns: the completed advert.
        :raises AttributeError: when ``advert`` is None and ``url`` does not
            match the expected advert URL layout (no regex match).
        """
        if advert is None:
            # Raw string so that ``\d`` is a regex escape and not a string
            # escape; the dot before "html" is escaped so that the title
            # group cannot stop early on a title that itself contains "html".
            re_id_title = re.compile(
                r'/offres-emploi-cadres/\d*_\d*_\d*_(.*?)________(.*?)\.html(.*?)',
                re.DOTALL)
            # Search once and reuse the match for both the id and the title.
            match = re_id_title.search(url)
            _id = u'%s/%s' % (match.group(1), match.group(2))
            advert = ApecJobAdvert(_id)
            advert.title = match.group(2).replace('-', ' ')

        root = self.document.getroot()
        description = root.xpath("//div[@class='contentWithDashedBorderTop marginTop boxContent']/div")[0]
        advert.description = html2text(self.parser.tostring(description))

        advert.job_name = advert.title

        # Each table row is a (header, value) pair; dispatch on header text.
        for tr in root.xpath("//table[@class='noFieldsTable']/tr"):
            th = self.parser.select(tr, 'th', 1, method='xpath')
            td = self.parser.select(tr, 'td', 1, method='xpath')
            label = u'%s' % th.text_content()
            if u'Date de publication' in label:
                advert.publication_date = dateutil.parser.parse(td.text_content()).date()
            elif u'Société' in label and not advert.society_name:
                society_name = td.text_content()
                a = self.parser.select(td, 'a', method='xpath')
                if a:
                    # Drop the text of the embedded link from the cell text.
                    advert.society_name = u'%s' % society_name.replace(a[0].text_content(), '').strip()
                else:
                    advert.society_name = society_name.strip()
            elif u'Type de contrat' in label:
                advert.contract_type = u'%s' % td.text_content().strip()
            elif u'Lieu' in label:
                advert.place = u'%s' % td.text_content()
            elif u'Salaire' in label:
                advert.pay = u'%s' % td.text_content()
            elif u'Expérience' in label:
                advert.experience = u'%s' % td.text_content()

        advert.url = url
        return advert
Beispiel #32
0
    def send_email(self, backend_name, mail):
        """Convert a message into an e-mail and deliver it.

        The e-mail is piped to the configured ``pipe`` command when that
        setting is non-empty; otherwise it is sent through the server given
        by the ``smtp`` setting.

        :param backend_name: name of the backend the message comes from; used
            to build the sender address, the Message-Id and the references.
        :param mail: message to send.  Reads ``parent``, ``full_id``,
            ``title``, ``sender``, ``date``, ``flags``, ``content`` and
            ``signature``.
        :returns: True when the mail was handed over, False when delivery
            failed (the error is logged).
        """
        domain = self.config.get('domain')
        recipient = self.config.get('recipient')

        # Walk up the ancestor chain, closest parent first.  Each ancestor
        # must contribute its OWN id, so format parent_message (the loop
        # variable), not mail.parent.
        parent_message = mail.parent
        references = []
        while parent_message:
            references.append(u'<%s.%s@%s>' % (backend_name, parent_message.full_id, domain))
            parent_message = parent_message.parent
        subject = mail.title
        sender = u'"%s" <%s@%s>' % (mail.sender.replace('"', '""') if mail.sender else '',
                                    backend_name, domain)

        # assume that .date is an UTC datetime
        date = formatdate(time.mktime(utc2local(mail.date).timetuple()), localtime=True)
        msg_id = u'<%s.%s@%s>' % (backend_name, mail.full_id, domain)

        # Send HTML as-is only when enabled in the configuration; otherwise
        # HTML content is converted to plain text.
        if self.config.get('html') and mail.flags & mail.IS_HTML:
            body = mail.content
            content_type = 'html'
        else:
            if mail.flags & mail.IS_HTML:
                body = html2text(mail.content)
            else:
                body = mail.content
            content_type = 'plain'

        if body is None:
            body = ''

        if mail.signature:
            if self.config.get('html') and mail.flags & mail.IS_HTML:
                body += u'<p>-- <br />%s</p>' % mail.signature
            else:
                body += u'\n\n-- \n'
                if mail.flags & mail.IS_HTML:
                    body += html2text(mail.signature)
                else:
                    body += mail.signature

        # Header class is smart enough to try US-ASCII, then the charset we
        # provide, then fall back to UTF-8.
        header_charset = 'ISO-8859-1'

        # We must choose the body charset manually: keep the first charset
        # able to encode the whole body.
        for body_charset in 'US-ASCII', 'ISO-8859-1', 'UTF-8':
            try:
                body.encode(body_charset)
            except UnicodeError:
                pass
            else:
                break

        # Split real name (which is optional) and email address parts
        sender_name, sender_addr = parseaddr(sender)
        recipient_name, recipient_addr = parseaddr(recipient)

        # We must always pass Unicode strings to Header, otherwise it will
        # use RFC 2047 encoding even on plain ASCII strings.
        sender_name = str(Header(unicode(sender_name), header_charset))
        recipient_name = str(Header(unicode(recipient_name), header_charset))

        # Make sure email addresses do not contain non-ASCII characters
        sender_addr = sender_addr.encode('ascii')
        recipient_addr = recipient_addr.encode('ascii')

        # Create the message ('plain' stands for Content-Type: text/plain)
        msg = MIMEText(body.encode(body_charset), content_type, body_charset)
        msg['From'] = formataddr((sender_name, sender_addr))
        msg['To'] = formataddr((recipient_name, recipient_addr))
        msg['Subject'] = Header(unicode(subject), header_charset)
        msg['Message-Id'] = msg_id
        msg['Date'] = date
        if references:
            # references[0] is the direct parent; References lists the chain
            # from the oldest ancestor down to the direct parent.
            msg['In-Reply-To'] = references[0]
            msg['References'] = u" ".join(reversed(references))

        self.logger.info('Send mail from <%s> to <%s>' % (sender, recipient))
        if len(self.config.get('pipe')) > 0:
            p = subprocess.Popen(self.config.get('pipe'),
                                 shell=True,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
            # communicate() feeds stdin and drains stdout together, so a
            # command producing a lot of output cannot block on a full pipe
            # while we are still writing or waiting.
            output, _ = p.communicate(msg.as_string())
            if p.returncode != 0:
                self.logger.error('Unable to deliver mail: %s' % output.strip())
                return False
        else:
            # Send the message via SMTP to localhost:25
            try:
                smtp = SMTP(self.config.get('smtp'))
                smtp.sendmail(sender, recipient, msg.as_string())
            except Exception as e:
                self.logger.error('Unable to deliver mail: %s' % e)
                return False
            else:
                smtp.quit()

        return True
Beispiel #33
0
 def test_lefigaro(self):
     """Check that the first listed thread has non-empty root content.

     The content must be non-empty both raw and after HTML-to-text
     conversion.
     """
     threads = list(self.backend.iter_threads())
     assert len(threads)
     first_id = threads[0].id
     thread = self.backend.get_thread(first_id)
     root_content = thread.root.content
     assert len(root_content)
     assert len(html2text(thread.root.content))
Beispiel #34
0
 def clean(cls, txt, options=None):
     """Convert HTML to text.

     ``txt`` may be a string or an element; anything that is not a string
     is first serialized with ``html.tostring``.  ``options``, when truthy,
     is passed to ``html2text`` as keyword arguments.
     """
     if isinstance(txt, basestring):
         markup = txt
     else:
         markup = html.tostring(txt, encoding=unicode)
     kwargs = options if options else {}
     return html2text(markup, **kwargs)
Beispiel #35
0
 def clean(cls, txt):
     """Convert HTML to text.

     ``txt`` may be a string or an element; anything that is not a string
     is first serialized with ``html.tostring``.
     """
     if isinstance(txt, basestring):
         markup = txt
     else:
         markup = html.tostring(txt, encoding=unicode)
     return html2text(markup)
Beispiel #36
0
    def send_email(self, backend_name, mail):
        """Convert a message into an e-mail and deliver it.

        The e-mail is piped to the configured ``pipe`` command when that
        setting is non-empty; otherwise it is sent through the server given
        by the ``smtp`` setting.

        :param backend_name: name of the backend the message comes from; used
            to build the sender address, the Message-Id and the references.
        :param mail: message to send.  Reads ``parent``, ``full_id``,
            ``title``, ``sender``, ``date``, ``flags``, ``content`` and
            ``signature``.
        :returns: True when the mail was handed over, False when delivery
            failed (the error is logged).
        """
        domain = self.config.get('domain')
        recipient = self.config.get('recipient')

        # Walk up the ancestor chain, closest parent first.  Each ancestor
        # must contribute its OWN id, so format parent_message (the loop
        # variable), not mail.parent.
        parent_message = mail.parent
        references = []
        while parent_message:
            references.append(u'<%s.%s@%s>' %
                              (backend_name, parent_message.full_id, domain))
            parent_message = parent_message.parent
        subject = mail.title
        sender = u'"%s" <%s@%s>' % (mail.sender.replace('"', '""') if
                                    mail.sender else '', backend_name, domain)

        # assume that .date is an UTC datetime
        date = formatdate(time.mktime(utc2local(mail.date).timetuple()),
                          localtime=True)
        msg_id = u'<%s.%s@%s>' % (backend_name, mail.full_id, domain)

        # Send HTML as-is only when enabled in the configuration; otherwise
        # HTML content is converted to plain text.
        if self.config.get('html') and mail.flags & mail.IS_HTML:
            body = mail.content
            content_type = 'html'
        else:
            if mail.flags & mail.IS_HTML:
                body = html2text(mail.content)
            else:
                body = mail.content
            content_type = 'plain'

        if body is None:
            body = ''

        if mail.signature:
            if self.config.get('html') and mail.flags & mail.IS_HTML:
                body += u'<p>-- <br />%s</p>' % mail.signature
            else:
                body += u'\n\n-- \n'
                if mail.flags & mail.IS_HTML:
                    body += html2text(mail.signature)
                else:
                    body += mail.signature

        # Header class is smart enough to try US-ASCII, then the charset we
        # provide, then fall back to UTF-8.
        header_charset = 'ISO-8859-1'

        # We must choose the body charset manually: keep the first charset
        # able to encode the whole body.
        for body_charset in 'US-ASCII', 'ISO-8859-1', 'UTF-8':
            try:
                body.encode(body_charset)
            except UnicodeError:
                pass
            else:
                break

        # Split real name (which is optional) and email address parts
        sender_name, sender_addr = parseaddr(sender)
        recipient_name, recipient_addr = parseaddr(recipient)

        # We must always pass Unicode strings to Header, otherwise it will
        # use RFC 2047 encoding even on plain ASCII strings.
        sender_name = str(Header(unicode(sender_name), header_charset))
        recipient_name = str(Header(unicode(recipient_name), header_charset))

        # Make sure email addresses do not contain non-ASCII characters
        sender_addr = sender_addr.encode('ascii')
        recipient_addr = recipient_addr.encode('ascii')

        # Create the message ('plain' stands for Content-Type: text/plain)
        msg = MIMEText(body.encode(body_charset), content_type, body_charset)
        msg['From'] = formataddr((sender_name, sender_addr))
        msg['To'] = formataddr((recipient_name, recipient_addr))
        msg['Subject'] = Header(unicode(subject), header_charset)
        msg['Message-Id'] = msg_id
        msg['Date'] = date
        if references:
            # references[0] is the direct parent; References lists the chain
            # from the oldest ancestor down to the direct parent.
            msg['In-Reply-To'] = references[0]
            msg['References'] = u" ".join(reversed(references))

        self.logger.info('Send mail from <%s> to <%s>' % (sender, recipient))
        if len(self.config.get('pipe')) > 0:
            p = subprocess.Popen(self.config.get('pipe'),
                                 shell=True,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
            # communicate() feeds stdin and drains stdout together, so a
            # command producing a lot of output cannot block on a full pipe
            # while we are still writing or waiting.
            output, _ = p.communicate(msg.as_string())
            if p.returncode != 0:
                self.logger.error('Unable to deliver mail: %s' %
                                  output.strip())
                return False
        else:
            # Send the message via SMTP to localhost:25
            try:
                smtp = SMTP(self.config.get('smtp'))
                smtp.sendmail(sender, recipient, msg.as_string())
            except Exception as e:
                self.logger.error('Unable to deliver mail: %s' % e)
                return False
            else:
                smtp.quit()

        return True
Beispiel #37
0
 def clean(cls, txt):
     """Convert HTML to text.

     ``txt`` may be a string or an element; anything that is not a string
     is first serialized with ``html.tostring``.
     """
     if isinstance(txt, basestring):
         markup = txt
     else:
         markup = html.tostring(txt, encoding=unicode)
     return html2text(markup)
Beispiel #38
0
    def get_torrent(self, id):
        """Build a Torrent for ``id`` from the currently loaded page.

        Fills the url, size, seeders and leechers from the entry whose
        download link carries the wanted torrent id, then the description
        (from the titled boxes of the main column) and the file list.

        Returns the Torrent, or None (after logging a warning) when no entry
        matching the torrent id is found.
        """
        # NOTE(review): select(..., 1) presumably returns a single element
        # (the result is used directly as a node below) -- confirm.
        table = self.browser.parser.select(self.document.getroot(), 'div.thin',
                                           1)

        # Title: concatenated stripped text of the first <h2> when there is
        # one, otherwise the text of div.title_text.
        h2 = table.xpath('.//h2')
        if len(h2) > 0:
            title = u''.join([txt.strip() for txt in h2[0].itertext()])
        else:
            title = self.browser.parser.select(table, 'div.title_text', 1).text

        torrent = Torrent(id, title)
        # Torrent id: the part after the first '.' of id, or id itself.
        torrentid = id.split('.', 1)[1] if '.' in id else id
        # Two page layouts are handled: a table.torrent_table (rows are
        # <tr>), or, when absent, div.main_column (entries are <div>).
        table = self.browser.parser.select(self.document.getroot(),
                                           'table.torrent_table')
        if len(table) == 0:
            table = self.browser.parser.select(self.document.getroot(),
                                               'div.main_column', 1)
            is_table = False
        else:
            table = table[0]
            is_table = True

        # Scan direct children until the entry for torrentid is found.
        for tr in table.findall('tr' if is_table else 'div'):
            if is_table and 'group_torrent' in tr.attrib.get('class', ''):
                tds = tr.findall('td')

                # Only rows with exactly 5 cells are torrent rows.
                if len(tds) != 5:
                    continue

                url = tds[0].find('span').find('a').attrib['href']
                m = self.TORRENTID_REGEXP.match(url)
                if not m:
                    warning('ID not found')
                    continue
                if m.group(1) != torrentid:
                    continue

                torrent.url = self.format_url(url)
                # Cell 1 holds "<size> <unit>"; thousands separators are
                # removed before the float conversion.
                size, unit = tds[1].text.split()
                torrent.size = get_bytes_size(float(size.replace(',', '')),
                                              unit)
                torrent.seeders = int(tds[3].text)
                torrent.leechers = int(tds[4].text)
                break
            elif not is_table and tr.attrib.get('class', '').startswith('torrent_widget') \
                    and tr.attrib.get('class', '').endswith('pad'):
                url = tr.cssselect('a[title=Download]')[0].attrib['href']
                m = self.TORRENTID_REGEXP.match(url)
                if not m:
                    warning('ID not found')
                    continue
                if m.group(1) != torrentid:
                    continue

                torrent.url = self.format_url(url)
                # Size is the last <strong> of the details title, written as
                # "(<size> <unit>)".
                size, unit = tr.cssselect(
                    'div.details_title strong')[-1].text.strip('()').split()
                torrent.size = get_bytes_size(float(size.replace(',', '')),
                                              unit)
                # Counts are the text following the Seeders/Leechers icons.
                torrent.seeders = int(
                    tr.cssselect('img[title=Seeders]')[0].tail)
                torrent.leechers = int(
                    tr.cssselect('img[title=Leechers]')[0].tail)
                break

        # torrent.url is only set when a matching entry was found above.
        if not torrent.url:
            warning('Torrent %s not found in list' % torrentid)
            return None

        # Description: every box of the main column that has both a title
        # and a body contributes "<title>\n\n<body>\n".
        div = self.parser.select(self.document.getroot(), 'div.main_column', 1)
        for box in div.cssselect('div.box'):
            # NOTE: title is rebound here; the torrent title was already
            # passed to Torrent() above.
            title = None
            body = None

            title_t = box.cssselect('div.head')
            if len(title_t) > 0:
                title_t = title_t[0]
                # Prefer the <strong> child of the head when there is one.
                if title_t.find('strong') is not None:
                    title_t = title_t.find('strong')
                if title_t.text is not None:
                    title = title_t.text.strip()

            body_t = box.cssselect('div.body,div.desc')
            if body_t:
                # The last matching element is used as the body.
                body = html2text(self.parser.tostring(body_t[-1])).strip()

            if title and body:
                if torrent.description is NotLoaded:
                    torrent.description = u''
                torrent.description += u'%s\n\n%s\n' % (title, body)

        # File list: first-cell text of every row (except rows with class
        # colhead_dark) of the tables found in the files containers.
        divs = self.document.getroot().cssselect(
            'div#files_%s,div#filelist_%s,tr#torrent_%s td' %
            (torrentid, torrentid, torrentid))
        if divs:
            torrent.files = []
            for div in divs:
                table = div.find('table')
                if table is None:
                    continue
                for tr in table:
                    if tr.attrib.get('class', None) != 'colhead_dark':
                        torrent.files.append(tr.find('td').text)

        return torrent
Beispiel #39
0
 def get_error_message(self, error):
     """Return the text of the message registered for ``error``.

     The message is looked up in ``self.doc`` under the
     ``app.identification.erreur.`` prefix followed by ``str(error)`` and
     converted from HTML to text.  Returns None on KeyError.
     """
     prefix = 'app.identification.erreur.'
     lookup = prefix + str(error)
     try:
         message = html2text(self.doc[lookup])
     except KeyError:
         message = None
     return message
Beispiel #40
0
 def get_value(self, profile, consts):
     """Return ``profile[self.key]`` converted from HTML to stripped text.

     ``consts`` is accepted but not used by this method.
     """
     raw = unicode(profile[self.key])
     text = html2text(raw)
     return text.strip()
Beispiel #41
0
    def get_torrent(self, id):
        """Build a Torrent for ``id`` from the currently loaded page.

        Fills the url, size, seeders and leechers from the entry whose
        download link carries the wanted torrent id, then the description
        (from the titled boxes of the main column) and the file list.

        Returns the Torrent, or None (after logging a warning) when no entry
        matching the torrent id is found.
        """
        # NOTE(review): select(..., 1) presumably returns a single element
        # (the result is used directly as a node below) -- confirm.
        table = self.browser.parser.select(self.document.getroot(), 'div.thin', 1)

        # Title: concatenated stripped text of the first <h2> when there is
        # one, otherwise the text of div.title_text.
        h2 = table.xpath('.//h2')
        if len(h2) > 0:
            title = u''.join([txt.strip() for txt in h2[0].itertext()])
        else:
            title = self.browser.parser.select(table, 'div.title_text', 1).text

        torrent = Torrent(id, title)
        # Torrent id: the part after the first '.' of id, or id itself.
        if '.' in id:
            torrentid = id.split('.', 1)[1]
        else:
            torrentid = id
        # Two page layouts are handled: a table.torrent_table (rows are
        # <tr>), or, when absent, div.main_column (entries are <div>).
        table = self.browser.parser.select(self.document.getroot(), 'table.torrent_table')
        if len(table) == 0:
            table = self.browser.parser.select(self.document.getroot(), 'div.main_column', 1)
            is_table = False
        else:
            table = table[0]
            is_table = True

        # Scan direct children until the entry for torrentid is found.
        for tr in table.findall('tr' if is_table else 'div'):
            if is_table and 'group_torrent' in tr.attrib.get('class', ''):
                tds = tr.findall('td')

                # Only rows with exactly 5 cells are torrent rows.
                if not len(tds) == 5:
                    continue

                url = tds[0].find('span').find('a').attrib['href']
                m = self.TORRENTID_REGEXP.match(url)
                if not m:
                    warning('ID not found')
                    continue
                if m.group(1) != torrentid:
                    continue

                torrent.url = self.format_url(url)
                # Cell 1 holds "<size> <unit>"; thousands separators are
                # removed before the float conversion.
                size, unit = tds[1].text.split()
                torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
                torrent.seeders = int(tds[3].text)
                torrent.leechers = int(tds[4].text)
                break
            elif not is_table and tr.attrib.get('class', '').startswith('torrent_widget') \
                    and tr.attrib.get('class', '').endswith('pad'):
                url = tr.cssselect('a[title=Download]')[0].attrib['href']
                m = self.TORRENTID_REGEXP.match(url)
                if not m:
                    warning('ID not found')
                    continue
                if m.group(1) != torrentid:
                    continue

                torrent.url = self.format_url(url)
                # Size is the last <strong> of the details title, written as
                # "(<size> <unit>)".
                size, unit = tr.cssselect('div.details_title strong')[-1].text.strip('()').split()
                torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
                # Counts are the text following the Seeders/Leechers icons.
                torrent.seeders = int(tr.cssselect('img[title=Seeders]')[0].tail)
                torrent.leechers = int(tr.cssselect('img[title=Leechers]')[0].tail)
                break

        # torrent.url is only set when a matching entry was found above.
        if not torrent.url:
            warning('Torrent %s not found in list' % torrentid)
            return None

        # Description: every box of the main column that has both a title
        # and a body contributes "<title>\n\n<body>\n".
        div = self.parser.select(self.document.getroot(), 'div.main_column', 1)
        for box in div.cssselect('div.box'):
            # NOTE: title is rebound here; the torrent title was already
            # passed to Torrent() above.
            title = None
            body = None

            title_t = box.cssselect('div.head')
            if len(title_t) > 0:
                title_t = title_t[0]
                # Prefer the <strong> child of the head when there is one.
                if title_t.find('strong') is not None:
                    title_t = title_t.find('strong')
                if title_t.text is not None:
                    title = title_t.text.strip()

            body_t = box.cssselect('div.body,div.desc')
            if body_t:
                # The last matching element is used as the body.
                body = html2text(self.parser.tostring(body_t[-1])).strip()

            if title and body:
                if torrent.description is NotLoaded:
                    torrent.description = u''
                torrent.description += u'%s\n\n%s\n' % (title, body)

        # File list: first-cell text of every row (except rows with class
        # colhead_dark) of the tables found in the files containers.
        divs = self.document.getroot().cssselect('div#files_%s,div#filelist_%s,tr#torrent_%s td' % (torrentid, torrentid, torrentid))
        if divs:
            torrent.files = []
            for div in divs:
                table = div.find('table')
                if table is None:
                    continue
                for tr in table:
                    if tr.attrib.get('class', None) != 'colhead_dark':
                        torrent.files.append(tr.find('td').text)

        return torrent
Beispiel #42
0
 def get_error_message(self, error):
     """Return the text of the message registered for ``error``.

     The message is looked up in ``self.doc`` under the
     ``app.identification.erreur.`` prefix followed by ``str(error)`` and
     converted from HTML to text.  Returns None on KeyError.
     """
     prefix = 'app.identification.erreur.'
     lookup = prefix + str(error)
     try:
         message = html2text(self.doc[lookup])
     except KeyError:
         message = None
     return message
Beispiel #43
0
 def get_value(self, profile, consts):
     """Return ``profile[self.key]`` converted from HTML to stripped text.

     ``consts`` is accepted but not used by this method.
     """
     raw = unicode(profile[self.key])
     text = html2text(raw)
     return text.strip()
Beispiel #44
0
 def clean(cls, txt, options=None):
     """Convert HTML to text.

     ``txt`` may be a string or an element; anything that is not a string
     is first serialized with ``html.tostring``.  ``options``, when truthy,
     is passed to ``html2text`` as keyword arguments.
     """
     if isinstance(txt, basestring):
         markup = txt
     else:
         markup = html.tostring(txt, encoding=unicode)
     kwargs = options if options else {}
     return html2text(markup, **kwargs)
Beispiel #45
0
 def on_load(self):
     """Raise TransferBankError when the page embeds an error script.

     The script is located by its ``if ("true"==="true")`` marker; the
     error text is taken from the argument of its ``.html("...")`` call,
     converted from HTML to text and cleaned.
     """
     # This aims to track input errors.
     script_error = CleanText(u"//script[contains(text(), 'if (\"true\"===\"true\")')]")(self.doc)
     if script_error:
         # NOTE(review): if the script has no .html("...") call, re.search
         # returns None and .group(1) raises AttributeError -- confirm that
         # the call is always present.
         raise TransferBankError(CleanText().filter(html2text(re.search(u'\.html\("(.*?)"\)', script_error).group(1))))