def format_obj(self, obj, alias):
    """Render a mail-like message as plain text with bold header labels."""
    out = u'%sTitle:%s %s\n' % (self.BOLD, self.NC, obj.title)
    out += u'%sDate:%s %s\n' % (self.BOLD, self.NC, obj.date.strftime('%Y-%m-%d %H:%M'))
    out += u'%sFrom:%s %s\n' % (self.BOLD, self.NC, obj.sender)
    if hasattr(obj, 'receivers') and obj.receivers:
        out += u'%sTo:%s %s\n' % (self.BOLD, self.NC, ', '.join(obj.receivers))
    # HTML messages are converted to text once, and the same flag decides
    # how the signature is rendered below.
    is_html = obj.flags & Message.IS_HTML
    body = html2text(obj.content) if is_html else obj.content
    out += '\n%s' % body
    if obj.signature:
        sig = html2text(obj.signature) if is_html else obj.signature
        out += '\n-- \n%s' % sig
    return out
def filter(self, el):
    """Extract the article resume, preferring the full version over the short one."""
    _resume = el[0].xpath("p[@data-rel='full-resume']")
    if not _resume:
        # Fall back to the abbreviated resume when no full one exists.
        _resume = el[0].xpath("p[@data-rel='small-resume']")
    if _resume:
        # [6:] drops a fixed-length label prefix from the converted text.
        resume = html2text(CleanText(_resume[0])(self))[6:]
        return resume
    # NOTE(review): implicitly returns None when neither paragraph is present.
def do_status(self, line):
    """
    status

    Display status information about a backend.
    """
    # An empty command line means "all backends".
    backend_name = line if len(line) > 0 else None
    results = {}
    for field in self.do('get_account_status', backends=backend_name, caps=CapAccount):
        results.setdefault(field.backend, []).append(field)
    for name, fields in results.iteritems():
        print(':: %s ::' % name)
        for f in fields:
            value = html2text(f.value) if f.flags & f.FIELD_HTML else f.value
            print('%s: %s' % (f.label, value))
        print('')
def read_renew(self, id):
    """Find the loan row matching *id* and return a Renew with its status text."""
    for row in self.document.getroot().xpath('//tr[@class="patFuncEntry"]'):
        if not row.xpath('td/input[@value="%s"]' % id):
            continue
        status_cell = row.xpath('td[@class="patFuncStatus"]')[0]
        raw = self.browser.parser.tostring(status_cell)
        renew = Renew(id)
        # Flatten the converted HTML onto a single line.
        renew.message = html2text(raw).replace('\n', '')
        return renew
def fill_gallery(self, gallery):
    """Populate *gallery* fields from the gallery detail page."""
    gallery.title = self.document.xpath("//h1[@id='gn']/text()")[0]
    try:
        # Original-language title is optional on the page.
        gallery.original_title = self.document.xpath("//h1[@id='gj']/text()")[0]
    except IndexError:
        gallery.original_title = None
    description_div = self.document.xpath("//div[@id='gd71']")[0]
    description_html = self.parser.tostring(description_div)
    gallery.description = html2text(description_html)
    # "Images:" row of the details table holds the page count.
    cardinality_string = self.document.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Images:']/td[@class='gdt2']/text()")[0]
    gallery.cardinality = int(re.match(r"\d+", cardinality_string).group(0))
    date_string = self.document.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Posted:']/td[@class='gdt2']/text()")[0]
    gallery.date = datetime.strptime(date_string, "%Y-%m-%d %H:%M")
    rating_string = self.document.xpath("//td[@id='rating_label']/text()")[0]
    rating_match = re.search(r"\d+\.\d+", rating_string)
    # The rating label may carry no numeric value (e.g. not yet rated).
    gallery.rating = None if rating_match is None else float(rating_match.group(0))
    gallery.rating_max = 5
    try:
        thumbnail_url = self.document.xpath("//div[@class='gdtm']/a/img/attribute::src")[0]
    except IndexError:
        # Thumbnail may be delivered as a CSS background image instead of <img>.
        thumbnail_style = self.document.xpath("//div[@class='gdtm']/div/attribute::style")[0]
        thumbnail_url = re.search(r"background:[^;]+url\((.+?)\)", thumbnail_style).group(1)
    gallery.thumbnail = BaseImage(thumbnail_url)
    gallery.thumbnail.url = gallery.thumbnail.id
def fill_gallery(self, gallery):
    """Populate *gallery* fields from the gallery detail page."""
    gallery.title = self.document.xpath("//h1[@id='gn']/text()")[0]
    try:
        # Original-language title is optional on the page.
        gallery.original_title = self.document.xpath("//h1[@id='gj']/text()")[0]
    except IndexError:
        gallery.original_title = None
    description_div = self.document.xpath("//div[@id='gd71']")[0]
    description_html = self.parser.tostring(description_div)
    gallery.description = html2text(description_html)
    # "Images:" row of the details table holds the page count.
    cardinality_string = self.document.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Images:']/td[@class='gdt2']/text()")[0]
    gallery.cardinality = int(re.match(r"\d+", cardinality_string).group(0))
    date_string = self.document.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Posted:']/td[@class='gdt2']/text()")[0]
    gallery.date = datetime.strptime(date_string, "%Y-%m-%d %H:%M")
    rating_string = self.document.xpath("//td[@id='rating_label']/text()")[0]
    rating_match = re.search(r"\d+\.\d+", rating_string)
    # The rating label may carry no numeric value (e.g. not yet rated).
    if rating_match is None:
        gallery.rating = None
    else:
        gallery.rating = float(rating_match.group(0))
    gallery.rating_max = 5
    try:
        thumbnail_url = self.document.xpath("//div[@class='gdtm']/a/img/attribute::src")[0]
    except IndexError:
        # Thumbnail may be delivered as a CSS background image instead of <img>.
        thumbnail_style = self.document.xpath("//div[@class='gdtm']/div/attribute::style")[0]
        thumbnail_url = re.search(r"background:[^;]+url\((.+?)\)", thumbnail_style).group(1)
    gallery.thumbnail = BaseImage(thumbnail_url)
    gallery.thumbnail.url = gallery.thumbnail.id
def set_video_metadata(self, video):
    """Fill *video* (title, author, thumbnail, duration, description) from <head> meta tags."""
    head = self.parser.select(self.document.getroot(), 'head', 1)
    video.title = unicode(self.parser.select(head, 'meta[property="og:title"]', 1).get("content")).strip()
    video.author = unicode(self.parser.select(head, 'meta[name="author"]', 1).get("content")).strip()
    url = unicode(self.parser.select(head, 'meta[property="og:image"]', 1).get("content")).strip()
    # remove the useless anti-caching query string
    url = re.sub('\?\d+', '', url)
    video.thumbnail = BaseImage(url)
    video.thumbnail.url = video.thumbnail.id
    try:
        parts = self.parser.select(head, 'meta[property="video:duration"]', 1).get("content").strip().split(':')
    except BrokenPageError:
        # No duration meta tag: it's probably a live stream, not a problem.
        video.duration = NotAvailable
    else:
        # Duration may be "S", "M:S" or "H:M:S".
        if len(parts) == 1:
            seconds = parts[0]
            hours = minutes = 0
        elif len(parts) == 2:
            minutes, seconds = parts
            hours = 0
        elif len(parts) == 3:
            hours, minutes, seconds = parts
        else:
            raise BrokenPageError('Unable to parse duration %r' % parts)
        video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
    try:
        video.description = html2text(self.parser.select(head, 'meta[property="og:description"]', 1).get("content")).strip() or unicode()
    except BrokenPageError:
        video.description = u''
def get_job_advert(self, url, advert):
    """Parse an Adecco job-offer page into *advert* (created from the URL if None).

    :param url: offer URL; its path encodes contract type, slug and offer id
    :param advert: existing AdeccoJobAdvert to fill, or None to create one
    :return: the filled advert
    """
    re_id = re.compile(
        "http://www.adecco.fr/trouver-un-emploi/Pages/Details-de-l-Offre/(.*?)/(.*?).aspx\?IOF=(.*?)$",
        re.DOTALL
    )
    # Fix: the original re-ran re_id.search(url) four times; match once.
    url_match = re_id.search(url)
    if advert is None:
        _id = u"%s/%s/%s" % (url_match.group(1), url_match.group(2), url_match.group(3))
        advert = AdeccoJobAdvert(_id)
        advert.contract_type = url_match.group(1)
    div = self.document.getroot().xpath("//div[@class='contain_MoreResults']")[0]
    date = u"%s" % self.parser.select(div, "div[@class='dateResult']", 1, method="xpath").text.strip()
    # Dates look like "01 janvier 2014"; month name is resolved via MONTHS.
    m = re.match("(\d{2})\s(.*?)\s(\d{4})", date)
    if m:
        dd = int(m.group(1))
        mm = MONTHS.index(m.group(2)) + 1
        yyyy = int(m.group(3))
        advert.publication_date = datetime.date(yyyy, mm, dd)
    title = self.parser.select(div, "h1", 1, method="xpath").text_content().strip()
    town = self.parser.select(div, "h1/span/span[@class='town']", 1, method="xpath").text_content()
    page_title = self.parser.select(div, "h1/span[@class='pageTitle']", 1, method="xpath").text_content()
    # The <h1> concatenates title, town and page title; strip the latter two.
    advert.title = u"%s" % title.replace(town, "").replace(page_title, "")
    spans = self.document.getroot().xpath("//div[@class='jobGreyContain']/table/tr/td/span[@class='value']")
    advert.job_name = u"%s" % spans[0].text
    advert.place = u"%s" % spans[1].text
    advert.pay = u"%s" % spans[2].text
    advert.contract_type = u"%s" % spans[3].text
    advert.url = url
    description = self.document.getroot().xpath("//div[@class='descriptionContainer']/p")[0]
    advert.description = html2text(self.parser.tostring(description))
    return advert
def test_lefigaro(self):
    """Smoke-test the configured Le Figaro RSS feed: threads exist and have content."""
    feed = self.backend.config['feed'].get()
    self.backend.RSS_FEED = "http://www.lefigaro.fr/rss/figaro_%s.xml" % feed
    threads = list(self.backend.iter_threads())
    assert threads
    first = self.backend.get_thread(threads[0].id)
    assert first.root.content
    assert html2text(first.root.content)
def format_obj(self, obj, alias):
    """Render an issue as colored text: header, body, attributes, attachments, history."""
    # Header line: "project — id — title" with per-field colors.
    result = u'%s %s %s %s %s\n' % (self.colored(obj.project.name, 'blue', 'bold'),
                                    self.colored(u'—', 'cyan', 'bold'),
                                    self.colored(obj.fullid, 'red', 'bold'),
                                    self.colored(u'—', 'cyan', 'bold'),
                                    self.colored(obj.title, 'yellow', 'bold'))
    result += '\n%s\n\n' % obj.body
    result += self.format_key('Author', '%s (%s)' % (obj.author.name, obj.creation))
    result += self.format_attr(obj, 'status')
    result += self.format_attr(obj, 'priority')
    result += self.format_attr(obj, 'version')
    result += self.format_attr(obj, 'tracker')
    result += self.format_attr(obj, 'category')
    result += self.format_attr(obj, 'assignee')
    # Tracker-specific custom fields, when present and loaded.
    if hasattr(obj, 'fields') and not empty(obj.fields):
        for key, value in obj.fields.iteritems():
            result += self.format_key(key.capitalize(), value)
    if hasattr(obj, 'attachments') and obj.attachments:
        result += '\n%s\n' % self.colored('Attachments:', 'green')
        for a in obj.attachments:
            result += '* %s%s%s <%s>\n' % (self.BOLD, a.filename, self.NC, a.url)
    if hasattr(obj, 'history') and obj.history:
        result += '\n%s\n' % self.colored('History:', 'green')
        for u in obj.history:
            result += '%s %s %s %s\n' % (self.colored('*', 'red', 'bold'),
                                         self.colored(u.date, 'yellow', 'bold'),
                                         self.colored(u'—', 'cyan', 'bold'),
                                         self.colored(u.author.name, 'blue', 'bold'))
            for change in u.changes:
                result += ' - %s %s %s %s\n' % (self.colored(change.field, 'green'),
                                                change.last,
                                                self.colored('->', 'magenta'),
                                                change.new)
            if u.message:
                # Indent the converted update message under its header line.
                result += ' %s\n' % html2text(u.message).strip().replace('\n', '\n ')
    return result
def get_thread_mails(self):
    """Scrape the current thread page into {'member': {...}, 'messages': [...]}."""
    mails = {
        'member': {},
        'messages': [],
    }
    try:
        mails['member']['pseudo'] = self.parser.tocleanstring(self.document.getroot().cssselect('div#message_heading div.username span.name')[0])
    except IndexError:
        mails['member']['pseudo'] = 'Unknown'
    # Messages are listed newest-first in the DOM; reverse to chronological order.
    for li in reversed(self.document.xpath('//ul[@id="thread"]//li[contains(@id, "message_")]')):
        try:
            txt = self.parser.tostring(li.xpath('.//div[@class="message_body"]')[0])
        except IndexError:
            continue  # 'Match' message: no body, skip it
        txt = html2text(txt).strip()
        # The timestamp is embedded in an inline script as a Unix epoch.
        m = re.search(r'(\d+), ', li.xpath('.//span[@class="timestamp"]//script')[0].text)
        assert m
        date = local2utc(datetime.fromtimestamp(int(m.group(1))))
        # Sender id is the last path segment of the profile link.
        id_from = li.find('a').attrib['href'].split('/')[-1].split('?')[0]
        mails['messages'].append({
            'date': date,
            'message': unicode(txt),
            'id_from': unicode(id_from),
        })
    return mails
def test_lefigaro(self):
    """Check the configured feed exposes at least one thread with readable content."""
    self.backend.RSS_FEED = ("http://www.lefigaro.fr/rss/figaro_%s.xml"
                             % self.backend.config['feed'].get())
    threads = list(self.backend.iter_threads())
    assert len(threads) > 0
    thread = self.backend.get_thread(threads[0].id)
    assert len(thread.root.content) > 0
    assert len(html2text(thread.root.content)) > 0
def read_renew(self, id):
    """Locate the checkout row whose checkbox value is *id*; return its Renew."""
    entries = self.document.getroot().xpath('//tr[@class="patFuncEntry"]')
    for entry in entries:
        matches = entry.xpath('td/input[@value="%s"]' % id)
        if matches:
            status_cell = entry.xpath('td[@class="patFuncStatus"]')[0]
            message_html = self.browser.parser.tostring(status_cell)
            renew = Renew(id)
            # Collapse the converted text onto one line.
            renew.message = html2text(message_html).replace('\n', '')
            return renew
def on_load(self):
    """Raise TransferBankError when the page embeds an input-error script."""
    # The bank reports form errors through an always-true inline script.
    xpath = u"//script[contains(text(), 'if (\"true\"===\"true\")')]"
    script_error = CleanText(xpath)(self.doc)
    if not script_error:
        return
    # The message is injected via jQuery .html("..."): pull it out and
    # convert to plain text before raising.
    raw = re.search(u'\.html\("(.*?)"\)', script_error).group(1)
    raise TransferBankError(CleanText().filter(html2text(raw)))
def fill_special_advert(self, advert, div):
    """Fill title/description/contract type from a "special" advert layout."""
    poste = self.parser.select(div, 'div[@class="poste"]', 1, method='xpath')
    advert.title = u'%s' % poste.text
    body = self.parser.select(div, 'div[@id="jobBodyContent"]', 1, method='xpath')
    advert.description = html2text(self.parser.tostring(body))
    left_menu = self.document.getroot().xpath('//div[@id="divmenuGauche"]')[0]
    contract_type = self.parser.select(left_menu, '//span[@itemprop="employmentType"]', method='xpath')
    if contract_type:
        advert.contract_type = u'%s' % contract_type[0].text_content()
    return self.fill_advert(advert, left_menu)
def get_biography(self):
    """Concatenate the biography text starting at the 'mini_bio' anchor element."""
    container = self.parser.select(self.document.getroot(), 'div#bio_content', 1)
    bio = unicode()
    collecting = False
    for child in container.getchildren():
        # Everything from the 'mini_bio' marker onwards belongs to the bio.
        if child.attrib.get('name') == 'mini_bio':
            collecting = True
        if collecting:
            bio += html2text(self.parser.tostring(child))
    return bio
def get_video(self, video=None):
    """Return (json_url, video) for the current Arte live page, creating the video if needed."""
    if not video:
        video = ArteLiveVideo(self.group_dict['id'])
    presentation = self.document.xpath('//div[@class="bloc-presentation"]')[0]
    selector = 'div[@class="field field-name-body field-type-text-with-summary field-label-hidden bloc-rte"]'
    description_node = self.parser.select(presentation, selector, 1, method='xpath')
    video.description = html2text(self.parser.tostring(description_node))
    container = self.document.xpath('//div[@class="video-container"]')[0]
    return container.attrib['arte_vp_url'], video
def fill_normal_advert(self, advert, div):
    """Fill title/description/contract/society from the standard advert layout."""
    advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text
    body = self.parser.select(div, 'div[@id="jobBodyContent"]', 1, method='xpath')
    advert.description = html2text(self.parser.tostring(body))
    summary = self.document.getroot().xpath('//div[@id="jobsummary_content"]')[0]
    contract = self.parser.select(summary, 'dl/dd[@class="multipleddlast"]/span', method='xpath')
    if contract:
        advert.contract_type = u'%s' % contract[0].text_content()
    society = self.parser.select(summary, '//span[@itemprop="name"]', method='xpath')
    if society:
        advert.society_name = u'%s' % society[0].text_content()
    return self.fill_advert(advert, summary)
def get_video(self, video=None):
    """Return (json_url, video); the id is prefixed with '/' in this variant."""
    if not video:
        video = ArteLiveVideo('/%s' % self.group_dict['id'])
    bloc = self.document.xpath('//div[@class="bloc-presentation"]')[0]
    field_class = 'div[@class="field field-name-body field-type-text-with-summary field-label-hidden bloc-rte"]'
    description = self.parser.select(bloc, field_class, 1, method='xpath')
    video.description = html2text(self.parser.tostring(description))
    json_url = self.document.xpath('//div[@class="video-container"]')[0].attrib['arte_vp_url']
    return json_url, video
def get_job_advert(self, url, advert):
    """Parse a Pôle Emploi offer page into *advert* (created from the page id if falsy)."""
    content = self.document.getroot().xpath('//div[@id="offre-body"]')[0]
    if not advert:
        _id = self.parser.select(content, 'div/div/ul/li/div[@class="value"]/span', 1, method='xpath').text
        advert = PopolemploiJobAdvert(_id)
    advert.title = u'%s' % self.parser.select(content, 'h4', 1, method='xpath').text.strip()
    advert.job_name = u'%s' % self.parser.select(content, 'h4', 1, method='xpath').text.strip()
    description = self.parser.select(content, 'p[@itemprop="description"]', 1, method='xpath')
    advert.description = html2text(self.parser.tostring(description))
    # Society name is optional on the page.
    society_name = self.parser.select(content, 'div[@class="vcard"]/p[@class="title"]/span', method='xpath')
    if society_name:
        advert.society_name = u'%s' % society_name[0].text
    advert.url = url
    place = u'%s' % self.parser.select(content, 'dl/dd/ul/li[@itemprop="addressRegion"]', 1, method='xpath').text
    advert.place = place.strip()
    contract_type = u'%s' % self.parser.select(content, 'dl/dd/span[@itemprop="employmentType"]', 1, method='xpath').text
    advert.contract_type = contract_type.strip()
    experience = u'%s' % self.parser.select(content, 'dl/dd/span[@itemprop="experienceRequirements"]', 1, method='xpath').text
    advert.experience = experience.strip()
    formation = u'%s' % self.parser.select(content, 'dl/dd/span[@itemprop="qualifications"]', 1, method='xpath').text
    advert.formation = formation.strip()
    pay = u'%s' % self.parser.select(content, 'dl/dd/span[@itemprop="baseSalary"]', 1, method='xpath').text
    advert.pay = pay.strip()
    return advert
def set_video_metadata(self, video):
    """Fill *video* (title, author, thumbnail, duration, description) from <head> meta tags."""
    head = self.parser.select(self.document.getroot(), 'head', 1)
    video.title = unicode(self.parser.select(head, 'meta[property="og:title"]', 1).get("content")).strip()
    video.author = unicode(self.parser.select(head, 'meta[name="author"]', 1).get("content")).strip()
    url = unicode(self.parser.select(head, 'meta[property="og:image"]', 1).get("content")).strip()
    # remove the useless anti-caching query string
    url = re.sub('\?\d+', '', url)
    video.thumbnail = BaseImage(url)
    video.thumbnail.url = video.thumbnail.id
    try:
        parts = self.parser.select(head, 'meta[property="video:duration"]', 1).get("content").strip().split(':')
    except BrokenPageError:
        # No duration meta tag: it's probably a live stream, not a problem.
        video.duration = NotAvailable
    else:
        # Duration may be "S", "M:S" or "H:M:S".
        if len(parts) == 1:
            seconds = parts[0]
            hours = minutes = 0
        elif len(parts) == 2:
            minutes, seconds = parts
            hours = 0
        elif len(parts) == 3:
            hours, minutes, seconds = parts
        else:
            raise BrokenPageError('Unable to parse duration %r' % parts)
        video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
    try:
        video.description = html2text(self.parser.select(head, 'meta[property="og:description"]', 1).get("content")).strip() or unicode()
    except BrokenPageError:
        video.description = u''
def get_job_advert(self, url, advert):
    """Parse an Adecco job-offer page into *advert* (created from the URL if None).

    :param url: offer URL; its path encodes contract type, slug and offer id
    :param advert: existing AdeccoJobAdvert to fill, or None to create one
    :return: the filled advert
    """
    re_id = re.compile(
        'http://www.adecco.fr/trouver-un-emploi/Pages/Details-de-l-Offre/(.*?)/(.*?).aspx\?IOF=(.*?)$',
        re.DOTALL)
    # Fix: the original re-ran re_id.search(url) four times; match once.
    url_match = re_id.search(url)
    if advert is None:
        _id = u'%s/%s/%s' % (url_match.group(1),
                             url_match.group(2),
                             url_match.group(3))
        advert = AdeccoJobAdvert(_id)
        advert.contract_type = url_match.group(1)
    div = self.document.getroot().xpath(
        "//div[@class='contain_MoreResults']")[0]
    date = u'%s' % self.parser.select(
        div, "div[@class='dateResult']", 1, method='xpath').text.strip()
    # Dates look like "01 janvier 2014"; month name is resolved via MONTHS.
    m = re.match('(\d{2})\s(.*?)\s(\d{4})', date)
    if m:
        dd = int(m.group(1))
        mm = MONTHS.index(m.group(2)) + 1
        yyyy = int(m.group(3))
        advert.publication_date = datetime.date(yyyy, mm, dd)
    title = self.parser.select(div, "h1", 1, method='xpath').text_content().strip()
    town = self.parser.select(div, "h1/span/span[@class='town']", 1, method='xpath').text_content()
    page_title = self.parser.select(div, "h1/span[@class='pageTitle']", 1, method='xpath').text_content()
    # The <h1> concatenates title, town and page title; strip the latter two.
    advert.title = u'%s' % title.replace(town, '').replace(page_title, '')
    spans = self.document.getroot().xpath(
        "//div[@class='jobGreyContain']/table/tr/td/span[@class='value']")
    advert.job_name = u'%s' % spans[0].text
    advert.place = u'%s' % spans[1].text
    advert.pay = u'%s' % spans[2].text
    advert.contract_type = u'%s' % spans[3].text
    advert.url = url
    description = self.document.getroot().xpath(
        "//div[@class='descriptionContainer']/p")[0]
    advert.description = html2text(self.parser.tostring(description))
    return advert
def get_job_advert(self, url, advert):
    """Parse an APEC offer page into *advert*; id and title come from the URL itself."""
    re_id_title = re.compile(
        '/offres-emploi-cadres/\d*_\d*_\d*_(.*?)________(.*?).html(.*?)',
        re.DOTALL)
    if advert is None:
        _id = u'%s/%s' % (re_id_title.search(url).group(1),
                          re_id_title.search(url).group(2))
        advert = ApecJobAdvert(_id)
        # The slug doubles as a human-readable title once dashes become spaces.
        advert.title = re_id_title.search(url).group(2).replace('-', ' ')
    description = self.document.getroot().xpath(
        "//div[@class='contentWithDashedBorderTop marginTop boxContent']/div"
    )[0]
    advert.description = html2text(self.parser.tostring(description))
    advert.job_name = advert.title
    # Attributes live in a <th>/<td> table keyed by French labels.
    trs = self.document.getroot().xpath(
        "//table[@class='noFieldsTable']/tr")
    for tr in trs:
        th = self.parser.select(tr, 'th', 1, method='xpath')
        td = self.parser.select(tr, 'td', 1, method='xpath')
        if u'Date de publication' in u'%s' % th.text_content():
            advert.publication_date = dateutil.parser.parse(
                td.text_content()).date()
        elif u'Société' in u'%s' % th.text_content(
        ) and not advert.society_name:
            society_name = td.text_content()
            a = self.parser.select(td, 'a', method='xpath')
            if a:
                # Drop the trailing link text (e.g. "voir l'entreprise").
                advert.society_name = u'%s' % society_name.replace(
                    a[0].text_content(), '').strip()
            else:
                advert.society_name = society_name.strip()
        elif u'Type de contrat' in u'%s' % th.text_content():
            advert.contract_type = u'%s' % td.text_content().strip()
        elif u'Lieu' in u'%s' % th.text_content():
            advert.place = u'%s' % td.text_content()
        elif u'Salaire' in u'%s' % th.text_content():
            advert.pay = u'%s' % td.text_content()
        elif u'Expérience' in u'%s' % th.text_content():
            advert.experience = u'%s' % td.text_content()
    advert.url = url
    return advert
def format_obj(self, obj, alias):
    """Render a purchase as human-readable text with a shop section."""
    if hasattr(obj, 'message') and obj.message:
        header = obj.message
    else:
        header = u'%s (%s)' % (obj.shop.name, obj.shop.location)
    parts = [u'%s%s%s\n' % (self.BOLD, header, self.NC)]
    parts.append(u'ID: %s\n' % obj.fullid)
    parts.append(u'Product: %s\n' % obj.product.name)
    parts.append(u'Cost: %s%s\n' % (obj.cost, obj.currency))
    if hasattr(obj, 'date') and obj.date:
        parts.append(u'Date: %s\n' % obj.date.strftime('%Y-%m-%d'))
    parts.append(u'\n%sShop:%s\n' % (self.BOLD, self.NC))
    parts.append(u'\tName: %s\n' % obj.shop.name)
    if obj.shop.location:
        parts.append(u'\tLocation: %s\n' % obj.shop.location)
    if obj.shop.info:
        # Indent the converted shop info under the Shop section.
        parts.append(u'\n\t' + html2text(obj.shop.info).replace('\n', '\n\t').strip())
    return u''.join(parts)
def format_obj(self, obj, alias):
    """Render an issue as colored text: header, body, attributes, attachments, history."""
    # Header line: "project — id — title" with per-field colors.
    result = u'%s %s %s %s %s\n' % (
        self.colored(obj.project.name, 'blue', 'bold'),
        self.colored(u'—', 'cyan', 'bold'),
        self.colored(obj.fullid, 'red', 'bold'),
        self.colored(u'—', 'cyan', 'bold'),
        self.colored(obj.title, 'yellow', 'bold'))
    result += '\n%s\n\n' % obj.body
    result += self.format_key('Author', '%s (%s)' % (obj.author.name, obj.creation))
    result += self.format_attr(obj, 'status')
    result += self.format_attr(obj, 'priority')
    result += self.format_attr(obj, 'version')
    result += self.format_attr(obj, 'tracker')
    result += self.format_attr(obj, 'category')
    result += self.format_attr(obj, 'assignee')
    # Tracker-specific custom fields, when present and loaded.
    if hasattr(obj, 'fields') and not empty(obj.fields):
        for key, value in obj.fields.items():
            result += self.format_key(key.capitalize(), value)
    if hasattr(obj, 'attachments') and obj.attachments:
        result += '\n%s\n' % self.colored('Attachments:', 'green')
        for a in obj.attachments:
            result += '* %s%s%s <%s>\n' % (self.BOLD, a.filename, self.NC, a.url)
    if hasattr(obj, 'history') and obj.history:
        result += '\n%s\n' % self.colored('History:', 'green')
        for u in obj.history:
            result += '%s %s %s %s\n' % (
                self.colored('*', 'red', 'bold'),
                self.colored(u.date, 'yellow', 'bold'),
                self.colored(u'—', 'cyan', 'bold'),
                self.colored(u.author.name, 'blue', 'bold'))
            for change in u.changes:
                result += ' - %s %s %s %s\n' % (
                    self.colored(change.field, 'green'),
                    change.last,
                    self.colored('->', 'magenta'),
                    change.new)
            if u.message:
                # Indent the converted update message under its header line.
                result += ' %s\n' % html2text(
                    u.message).strip().replace('\n', '\n ')
    return result
def get_thread_mails(self):
    """Scrape the current thread page into {'member': {...}, 'messages': [...]}."""
    mails = {
        'member': {},
        'messages': [],
    }
    try:
        mails['member']['pseudo'] = self.parser.tocleanstring(
            self.document.getroot().cssselect(
                '#message_heading div.username span.name')[0])
    except IndexError:
        mails['member']['pseudo'] = 'Unknown'
    # Messages are listed newest-first in the DOM; reverse to chronological order.
    for li in reversed(
            self.document.xpath(
                '//ul[@id="thread"]//li[contains(@id, "message_")]')):
        try:
            txt = self.parser.tostring(
                li.xpath('.//div[@class="message_body"]')[0])
        except IndexError:
            continue  # 'Match' message: no body, skip it
        txt = html2text(txt).strip()
        # The timestamp is embedded in an inline script as a Unix epoch.
        m = re.search(
            r'(\d+), ',
            li.xpath('.//span[@class="timestamp"]//script')[0].text)
        assert m
        date = local2utc(datetime.fromtimestamp(int(m.group(1))))
        # Sender id is the last path segment of the profile link.
        id_from = li.find('a').attrib['href'].split('/')[-1].split('?')[0]
        mails['messages'].append({
            'date': date,
            'message': unicode(txt),
            'id_from': unicode(id_from),
        })
    return mails
def get_job_advert(self, url, advert):
    """Parse an APEC offer page into *advert*; id and title come from the URL itself."""
    re_id_title = re.compile('/offres-emploi-cadres/\d*_\d*_\d*_(.*?)________(.*?).html(.*?)', re.DOTALL)
    if advert is None:
        _id = u'%s/%s' % (re_id_title.search(url).group(1), re_id_title.search(url).group(2))
        advert = ApecJobAdvert(_id)
        # The slug doubles as a human-readable title once dashes become spaces.
        advert.title = re_id_title.search(url).group(2).replace('-', ' ')
    description = self.document.getroot().xpath("//div[@class='contentWithDashedBorderTop marginTop boxContent']/div")[0]
    advert.description = html2text(self.parser.tostring(description))
    advert.job_name = advert.title
    # Attributes live in a <th>/<td> table keyed by French labels.
    trs = self.document.getroot().xpath("//table[@class='noFieldsTable']/tr")
    for tr in trs:
        th = self.parser.select(tr, 'th', 1, method='xpath')
        td = self.parser.select(tr, 'td', 1, method='xpath')
        if u'Date de publication' in u'%s' % th.text_content():
            advert.publication_date = dateutil.parser.parse(td.text_content()).date()
        elif u'Société' in u'%s' % th.text_content() and not advert.society_name:
            society_name = td.text_content()
            a = self.parser.select(td, 'a', method='xpath')
            if a:
                # Drop the trailing link text (e.g. "voir l'entreprise").
                advert.society_name = u'%s' % society_name.replace(a[0].text_content(), '').strip()
            else:
                advert.society_name = society_name.strip()
        elif u'Type de contrat' in u'%s' % th.text_content():
            advert.contract_type = u'%s' % td.text_content().strip()
        elif u'Lieu' in u'%s' % th.text_content():
            advert.place = u'%s' % td.text_content()
        elif u'Salaire' in u'%s' % th.text_content():
            advert.pay = u'%s' % td.text_content()
        elif u'Expérience' in u'%s' % th.text_content():
            advert.experience = u'%s' % td.text_content()
    advert.url = url
    return advert
def send_email(self, backend_name, mail):
    """Deliver *mail* (a backend message) as an RFC 2822 email.

    Builds From/To/Subject/Message-Id/References headers, encodes the body
    with the narrowest charset that fits, then delivers either through the
    configured pipe command or via SMTP.

    :return: True on successful delivery, False otherwise
    """
    domain = self.config.get('domain')
    recipient = self.config.get('recipient')

    # Walk the ancestor chain to build the References header.
    # Fix: the original appended mail.parent.full_id on every iteration,
    # so every reference was the immediate parent's id; use the id of the
    # ancestor currently being visited instead.
    parent_message = mail.parent
    references = []
    while parent_message:
        references.append(u'<%s.%s@%s>' % (backend_name, parent_message.full_id, domain))
        parent_message = parent_message.parent
    subject = mail.title
    sender = u'"%s" <%s@%s>' % (mail.sender.replace('"', '""') if mail.sender else '',
                                backend_name, domain)
    # assume that .date is an UTC datetime
    date = formatdate(time.mktime(utc2local(mail.date).timetuple()), localtime=True)
    msg_id = u'<%s.%s@%s>' % (backend_name, mail.full_id, domain)

    # Pick the body and MIME subtype depending on the 'html' option.
    if self.config.get('html') and mail.flags & mail.IS_HTML:
        body = mail.content
        content_type = 'html'
    else:
        if mail.flags & mail.IS_HTML:
            body = html2text(mail.content)
        else:
            body = mail.content
        content_type = 'plain'

    if body is None:
        body = ''

    if mail.signature:
        if self.config.get('html') and mail.flags & mail.IS_HTML:
            body += u'<p>-- <br />%s</p>' % mail.signature
        else:
            body += u'\n\n-- \n'
            if mail.flags & mail.IS_HTML:
                body += html2text(mail.signature)
            else:
                body += mail.signature

    # Header class is smart enough to try US-ASCII, then the charset we
    # provide, then fall back to UTF-8.
    header_charset = 'ISO-8859-1'

    # We must choose the body charset manually: use the first one that can
    # encode the body without error.
    for body_charset in 'US-ASCII', 'ISO-8859-1', 'UTF-8':
        try:
            body.encode(body_charset)
        except UnicodeError:
            pass
        else:
            break

    # Split real name (which is optional) and email address parts
    sender_name, sender_addr = parseaddr(sender)
    recipient_name, recipient_addr = parseaddr(recipient)

    # We must always pass Unicode strings to Header, otherwise it will
    # use RFC 2047 encoding even on plain ASCII strings.
    sender_name = str(Header(unicode(sender_name), header_charset))
    recipient_name = str(Header(unicode(recipient_name), header_charset))

    # Make sure email addresses do not contain non-ASCII characters
    sender_addr = sender_addr.encode('ascii')
    recipient_addr = recipient_addr.encode('ascii')

    # Create the message ('plain' stands for Content-Type: text/plain)
    msg = MIMEText(body.encode(body_charset), content_type, body_charset)
    msg['From'] = formataddr((sender_name, sender_addr))
    msg['To'] = formataddr((recipient_name, recipient_addr))
    msg['Subject'] = Header(unicode(subject), header_charset)
    msg['Message-Id'] = msg_id
    msg['Date'] = date
    if references:
        # In-Reply-To is the direct parent; References lists oldest first.
        msg['In-Reply-To'] = references[0]
        msg['References'] = u" ".join(reversed(references))

    self.logger.info('Send mail from <%s> to <%s>' % (sender, recipient))
    if len(self.config.get('pipe')) > 0:
        # Deliver through the user-configured pipe command.
        p = subprocess.Popen(self.config.get('pipe'),
                             shell=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        p.stdin.write(msg.as_string())
        p.stdin.close()
        if p.wait() != 0:
            self.logger.error('Unable to deliver mail: %s' % p.stdout.read().strip())
            return False
    else:
        # Send the message via SMTP to localhost:25
        try:
            smtp = SMTP(self.config.get('smtp'))
            smtp.sendmail(sender, recipient, msg.as_string())
        except Exception as e:
            self.logger.error('Unable to deliver mail: %s' % e)
            return False
        else:
            smtp.quit()
    return True
def test_lefigaro(self):
    """The default feed must yield at least one thread with readable content."""
    threads = list(self.backend.iter_threads())
    assert len(threads) > 0
    thread = self.backend.get_thread(threads[0].id)
    assert len(thread.root.content) > 0
    assert len(html2text(thread.root.content)) > 0
def clean(cls, txt, options=None):
    """Convert *txt* (markup string or lxml node) to text via html2text.

    *options*, when given, are forwarded as html2text keyword arguments.
    """
    if not isinstance(txt, basestring):
        # Serialize lxml elements to a unicode string first.
        txt = html.tostring(txt, encoding=unicode)
    return html2text(txt, **(options or {}))
def clean(cls, txt):
    """Convert *txt* (markup string or lxml node) to text via html2text."""
    if isinstance(txt, basestring):
        return html2text(txt)
    # Serialize lxml elements to a unicode string first.
    return html2text(html.tostring(txt, encoding=unicode))
def get_torrent(self, id):
    """Build a Torrent from the current detail page, or None if *id* is absent.

    Handles both the tabular layout (table.torrent_table) and the widget
    layout (div.main_column), then collects description boxes and the file
    list.
    """
    table = self.browser.parser.select(self.document.getroot(), 'div.thin', 1)
    h2 = table.xpath('.//h2')
    if len(h2) > 0:
        title = u''.join([txt.strip() for txt in h2[0].itertext()])
    else:
        title = self.browser.parser.select(table, 'div.title_text', 1).text
    torrent = Torrent(id, title)
    # Composite ids are "group.torrentid"; keep only the torrent part.
    torrentid = id.split('.', 1)[1] if '.' in id else id
    table = self.browser.parser.select(self.document.getroot(), 'table.torrent_table')
    if len(table) == 0:
        # Fallback layout: rows are divs inside the main column.
        table = self.browser.parser.select(self.document.getroot(), 'div.main_column', 1)
        is_table = False
    else:
        table = table[0]
        is_table = True
    for tr in table.findall('tr' if is_table else 'div'):
        if is_table and 'group_torrent' in tr.attrib.get('class', ''):
            tds = tr.findall('td')
            if len(tds) != 5:
                continue
            url = tds[0].find('span').find('a').attrib['href']
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                warning('ID not found')
                continue
            if m.group(1) != torrentid:
                continue
            torrent.url = self.format_url(url)
            size, unit = tds[1].text.split()
            # Sizes use thousands separators, e.g. "1,234.5 MB".
            torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
            torrent.seeders = int(tds[3].text)
            torrent.leechers = int(tds[4].text)
            break
        elif not is_table and tr.attrib.get('class', '').startswith('torrent_widget') \
                and tr.attrib.get('class', '').endswith('pad'):
            url = tr.cssselect('a[title=Download]')[0].attrib['href']
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                warning('ID not found')
                continue
            if m.group(1) != torrentid:
                continue
            torrent.url = self.format_url(url)
            size, unit = tr.cssselect(
                'div.details_title strong')[-1].text.strip('()').split()
            torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
            torrent.seeders = int(
                tr.cssselect('img[title=Seeders]')[0].tail)
            torrent.leechers = int(
                tr.cssselect('img[title=Leechers]')[0].tail)
            break
    if not torrent.url:
        warning('Torrent %s not found in list' % torrentid)
        return None
    # Concatenate every titled description box into torrent.description.
    div = self.parser.select(self.document.getroot(), 'div.main_column', 1)
    for box in div.cssselect('div.box'):
        title = None
        body = None
        title_t = box.cssselect('div.head')
        if len(title_t) > 0:
            title_t = title_t[0]
            if title_t.find('strong') is not None:
                title_t = title_t.find('strong')
            if title_t.text is not None:
                title = title_t.text.strip()
        body_t = box.cssselect('div.body,div.desc')
        if body_t:
            body = html2text(self.parser.tostring(body_t[-1])).strip()
        if title and body:
            if torrent.description is NotLoaded:
                torrent.description = u''
            torrent.description += u'%s\n\n%s\n' % (title, body)
    # File list may live under any of these three container ids.
    divs = self.document.getroot().cssselect(
        'div#files_%s,div#filelist_%s,tr#torrent_%s td' % (torrentid, torrentid, torrentid))
    if divs:
        torrent.files = []
        for div in divs:
            table = div.find('table')
            if table is None:
                continue
            for tr in table:
                # Skip the header row.
                if tr.attrib.get('class', None) != 'colhead_dark':
                    torrent.files.append(tr.find('td').text)
    return torrent
def get_error_message(self, error):
    """Return the mapped error text for *error* as plain text, or None if unknown."""
    try:
        raw = self.doc['app.identification.erreur.' + str(error)]
    except KeyError:
        # Unmapped error code.
        return None
    return html2text(raw)
def get_value(self, profile, consts):
    """Return the profile field for this key as stripped plain text."""
    raw = unicode(profile[self.key])
    return html2text(raw).strip()
def get_torrent(self, id):
    """Build a Torrent from the current detail page, or None if *id* is absent.

    Handles both the tabular layout (table.torrent_table) and the widget
    layout (div.main_column), then collects description boxes and the file
    list.
    """
    table = self.browser.parser.select(self.document.getroot(), 'div.thin', 1)
    h2 = table.xpath('.//h2')
    if len(h2) > 0:
        title = u''.join([txt.strip() for txt in h2[0].itertext()])
    else:
        title = self.browser.parser.select(table, 'div.title_text', 1).text
    torrent = Torrent(id, title)
    # Composite ids are "group.torrentid"; keep only the torrent part.
    if '.' in id:
        torrentid = id.split('.', 1)[1]
    else:
        torrentid = id
    table = self.browser.parser.select(self.document.getroot(), 'table.torrent_table')
    if len(table) == 0:
        # Fallback layout: rows are divs inside the main column.
        table = self.browser.parser.select(self.document.getroot(), 'div.main_column', 1)
        is_table = False
    else:
        table = table[0]
        is_table = True
    for tr in table.findall('tr' if is_table else 'div'):
        if is_table and 'group_torrent' in tr.attrib.get('class', ''):
            tds = tr.findall('td')
            if not len(tds) == 5:
                continue
            url = tds[0].find('span').find('a').attrib['href']
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                warning('ID not found')
                continue
            if m.group(1) != torrentid:
                continue
            torrent.url = self.format_url(url)
            size, unit = tds[1].text.split()
            # Sizes use thousands separators, e.g. "1,234.5 MB".
            torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
            torrent.seeders = int(tds[3].text)
            torrent.leechers = int(tds[4].text)
            break
        elif not is_table and tr.attrib.get('class', '').startswith('torrent_widget') \
                and tr.attrib.get('class', '').endswith('pad'):
            url = tr.cssselect('a[title=Download]')[0].attrib['href']
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                warning('ID not found')
                continue
            if m.group(1) != torrentid:
                continue
            torrent.url = self.format_url(url)
            size, unit = tr.cssselect('div.details_title strong')[-1].text.strip('()').split()
            torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
            torrent.seeders = int(tr.cssselect('img[title=Seeders]')[0].tail)
            torrent.leechers = int(tr.cssselect('img[title=Leechers]')[0].tail)
            break
    if not torrent.url:
        warning('Torrent %s not found in list' % torrentid)
        return None
    # Concatenate every titled description box into torrent.description.
    div = self.parser.select(self.document.getroot(), 'div.main_column', 1)
    for box in div.cssselect('div.box'):
        title = None
        body = None
        title_t = box.cssselect('div.head')
        if len(title_t) > 0:
            title_t = title_t[0]
            if title_t.find('strong') is not None:
                title_t = title_t.find('strong')
            if title_t.text is not None:
                title = title_t.text.strip()
        body_t = box.cssselect('div.body,div.desc')
        if body_t:
            body = html2text(self.parser.tostring(body_t[-1])).strip()
        if title and body:
            if torrent.description is NotLoaded:
                torrent.description = u''
            torrent.description += u'%s\n\n%s\n' % (title, body)
    # File list may live under any of these three container ids.
    divs = self.document.getroot().cssselect('div#files_%s,div#filelist_%s,tr#torrent_%s td' % (torrentid, torrentid, torrentid))
    if divs:
        torrent.files = []
        for div in divs:
            table = div.find('table')
            if table is None:
                continue
            for tr in table:
                # Skip the header row.
                if tr.attrib.get('class', None) != 'colhead_dark':
                    torrent.files.append(tr.find('td').text)
    return torrent
def on_load(self):
    # This aims to track input errors reported by an always-true inline script.
    script_error = CleanText(u"//script[contains(text(), 'if (\"true\"===\"true\")')]")(self.doc)
    if script_error:
        # The message is injected via jQuery .html("..."): extract it,
        # convert to plain text, and raise it as a transfer error.
        raise TransferBankError(CleanText().filter(html2text(re.search(u'\.html\("(.*?)"\)', script_error).group(1))))