def get_sbl_items(self):
    try:
        page = self.s.get('http://spamhaus.org/sbl/latest/', headers=self.h)
        document = leaf.parse(page.text)
        items = document.xpath('body/div/table[2]/tr[3]/td[2]/table')
        if not items:
            c = document.xpath('body/div/table[2]/tr/td[2]/h1')[0]
            if c:
                raise Exception(c)
            raise Exception('table items not found')
        for i in items:
            try:
                self.get_sbl_item(i)
            except Exception as e:
                sbl_parser_log(e)
            # break
    except Exception as e:
        sbl_parser_log(e)
        reload_tor()
        self.get_sbl_items()
        return  # the recursive call saves the items; don't save them twice
    for item in self.sbl_items:
        obj = SblItem.objects.get_or_create(ref_name=item['ref_name'])[0]
        for key in item:
            setattr(obj, key, item[key])
        obj.save()
    sbl_parser_log('added %s sbl items' % len(self.sbl_items))
def run(self):
    self.session.get(thread_url)
    while True:
        try:
            sleep(10)
            poll = self.session.get(thread_url)
            if '#lastpost' in poll.url:
                continue
            if self.dead >= dead_threshold - 1:
                continue
            else:
                self.dead += 1
            new_index = int(get_new_post_number.findall(poll.url)[0]) - 1
            page = leaf.parse(poll.text)
            self.callback(self.channel, 'Forums {0} {1} made {2} new post in the NYC thread! {3}'.format(
                choice(user_adjectives) if random() < chance_of_using_a_funny_adjective else 'poster',
                page('.author')[new_index].text,
                choice(post_adjectives) if random() < chance_of_using_a_funny_adjective else 'a',
                poll.url
            ))
        except Exception as e:
            print(e)
def get_elements_by_css(url, css_selector):
    f = urllib.urlopen(url)
    html = f.read()
    if html:
        doc = leaf.parse(html)
        elements = doc(css_selector)
        return elements
    return []
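# A minimal usage sketch (not from the original source): the URL and the CSS
# selector below are placeholders, and Python 2's urllib is assumed to match
# the function above.
for element in get_elements_by_css('http://example.com/', 'div#menu a'):
    print element.text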
def test_bbcode():
    document = leaf.parse(sample)
    bbcode = document.parse(bbcode_formatter, 'http://example.com/')
    bbcode = leaf.strip_spaces(bbcode)
    bbcode = leaf.strip_symbols(bbcode)
    bbcode = leaf.strip_linebreaks(bbcode)
    assert bbcode == leaf.to_unicode(sample_result), "Sample bbcode formatter"
def test_selectors():
    document = leaf.parse(sample)
    links = document('div#menu a')
    assert links[-1].text == 'Contacts', "Access by id and element type"
    links2 = document('div#menu li a')
    assert links2[-1].text == ' Test link 5', "Access by id and element type 2"
    assert len(document('a')) == 9
    assert document('li.active_link a')[0].text == ' Test link 5', "Access by class"
def getSteamDeals():
    deals = []
    page = requests.get('http://store.steampowered.com/search/?sort_by=Metascore&sort_order=DESC&specials=1')
    temp = leaf.parse(page.text)
    links = temp('.search_result_row.even') + temp('.search_result_row.odd')
    for link in links:
        deals.append(Deal(link('.col.search_name.ellipsis')[0]('h4')[0].text,
                          link('.col.search_price')[0]('br')[0].tail,
                          link('.col.search_price')[0]('strike')[0].text))
    return deals
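# `Deal` is defined elsewhere in the original project. As an assumption, a
# minimal stand-in plus a usage sketch could look like this; the field names
# are guesses matching the argument order above (name, sale price, then the
# struck-through original price).
from collections import namedtuple

Deal = namedtuple('Deal', ['name', 'sale_price', 'original_price'])

for deal in getSteamDeals():
    print(deal)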
def leaf(self):
    """ Return body parsed by leaf """
    import leaf
    if not self._leaf:
        self._leaf = leaf.parse(self.response.unicode_body(), encoding=self.charset)
    return self._leaf
def GetImageUrl(page):
    doc = leaf.parse(page)
    data = doc(".issue img")
    logging.info(data)
    logging.info("Found %d img links", len(data))
    if len(data) == 1:
        img_block = data[0]
        if hasattr(img_block, "src"):
            logging.info(data[0].src)
            return data[0].src
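# A minimal usage sketch (not from the original source): the HTML below is a
# made-up page containing a single ".issue img" element.
sample_page = '<div class="issue"><img src="http://example.com/cover.jpg"/></div>'
print(GetImageUrl(sample_page))  # expected: http://example.com/cover.jpg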
def test_attribs():
    document = leaf.parse(sample)
    first_link = document.get('div#menu li')
    assert document.get('div#menu a', 4).text == ' Test link 5', "Get element by index"
    assert document.get('div#menu a', 99, default='blah') == 'blah', "Custom default value for get"
    assert bool(document.get('div#menu li')) == True, "Node bool"
    assert bool(document.get('div#menu_test li')) == False, "Node bool"
    assert isinstance(first_link, leaf.Parser), "Get first element"
    assert first_link.id == 'first_link', "Id attrib"
    assert first_link.onclick == "alert('test')", "Onclick attrib"
    first_link.onclick = 'blah()'
    assert first_link.onclick == 'blah()', "Attribute modification"
def on_new_comment(self, io, data, *ex_prms):
    try:
        username = data['lastpostuser']
        user_id = data['lastpostuserid']
        if user_id == self.user.user_id:
            return
        '''
        r = self.user.sess.get('https://www.fxp.co.il/showthread.php',
                               params={
                                   't': data['id'],
                                   'page': data['pages'],
                                   'web_fast_fxp': 1
                               })
        # comment = document.xpath(f'//div[@class="user-pic-holder user_pic_{user_id}"]/../../../../..')[-1]
        '''
        # new way
        r = self.user.sess.get('https://www.fxp.co.il/showthread.php',
                               params={
                                   't': data['id'],
                                   'pp': 1,
                                   'page': data['posts'] + 1,
                                   'web_fast_fxp': 1
                               })
        forum_id = int(
            re.search(r'FORUM_ID_FXP\s*=\s*"(.+?)"', r.text).group(1))
        document = leaf.parse(r.text)
        comment = document.xpath('//ol[@id="posts"]//li')[0]
        comment_content = comment.xpath(
            './/blockquote[@class="postcontent restore "]')[0]
        comment_id = int(comment.id.replace('post_', ''))
        parsed_content = comment_content.parse(self.bbcode_formatter).strip()
        quoted_me = self.is_quoted_me(comment_content)
        self.events.emit(
            FxpComment,
            FxpComment(username=username,
                       user_id=user_id,
                       id=int(comment_id),
                       content=parsed_content,
                       thread_id=int(data['id']),
                       thread_title=data['title'],
                       posts_number=int(data['posts']),
                       forum_id=forum_id,
                       quoted_me=quoted_me))
    except Exception as e:
        # raise
        pass
def test_attribs():
    document = leaf.parse(sample)
    first_link = document.get('div#menu li')
    assert document.get('div#menu a', 4).text == ' Test link 5', "Get element by index"
    assert document.get('div#menu a', 4).__unicode__() == ' Test link 5', "Unicode test on python2"
    assert document.get('div#menu a', 99, default='blah') == 'blah', "Custom default value for get"
    assert document.find('body/div[4]').tag == 'div', "ETree find"
    assert bool(document.get('div#menu li')), "Node bool"
    assert bool(document.get('div#menu_test li')) is False, "Node bool"
    assert isinstance(first_link, leaf.Parser), "Get first element"
    assert first_link.id == 'first_link', "Id attrib"
    assert first_link.onclick == "alert('test')", "Onclick attrib"
    first_link.onclick = 'blah()'
    assert first_link.onclick == 'blah()', "Attribute modification"
def processObject(self, o):
    if not Post.objects.filter(body__iexact=unicode_fix(o.post_text)).exists():
        author = self._getAuthor(o, o.poster)
        created = datetime.datetime.fromtimestamp(o.post_time)
        try:
            topic = Topic.objects.get(name=unicode_fix(o.topic.topic_title))
        except Topic.DoesNotExist:
            topic = Topic.objects.get(name__icontains=unicode_fix(o.topic.topic_title))
        text = self._process_text(unicode_fix(o.post_text))
        doc = leaf.parse(text)
        text = doc.parse(bbcode_formatter, self._originalAddress)
        Post(topic=topic, body=text, user_ip=o.poster_ip, user=author,
             created=created).save()
def get_schedule():
    html = urllib2.urlopen("http://codefest.ru/program/2011-03/").read()
    doc = leaf.parse(html)
    calendar = get_calendar()
    programs = doc("table.program tbody")
    section = 0
    day = 0
    for program in programs:
        talks = program("tr")
        for talk in talks:
            time_tag = talk("td")[0]
            about_tag = talk("td")[-1]
            topic_tag = talk.get("td a")
            start = (None, None)
            end = (None, None)
            topic = None
            topic_about = None
            speaker = None
            name = None
            if time_tag is not None:
                time = time_tag.text
                r = re.search(u".*(\d\d):(\d\d).*(\d\d):(\d\d).*", time)
                if r is not None:
                    start = (int(r.group(1)), int(r.group(2)))
                    end = (int(r.group(3)), int(r.group(4)))
            if topic_tag is not None:
                topic = topic_tag.text
                topic_about = topic_tag.href
            if about_tag is not None:
                name = about_tag.text
            print time_tag.text, topic, topic_about, DAYS[day][section]
            add_event(calendar, start, end, name, topic, topic_about, day, DAYS[day][section])
        section += 1
        if len(DAYS[day]) == section:
            day += 1
            section = 0
def get_forum_threads(self, forum_id, page=1, post_per_page=25):
    """Get list of the threads in the forum

    Args:
        forum_id (int): Forum id.
        page (int): Page number.
        post_per_page (int): Posts per page (MAX=200).

    Returns:
        list: List of ids.
    """
    r = self.sess.get('https://www.fxp.co.il/forumdisplay.php',
                      params={
                          'f': forum_id,
                          'page': page,
                          'pp': post_per_page,
                          'web_fast_fxp': 1
                      })
    return [
        int(thread_id.replace('thread_', ''))
        for thread_id in leaf.parse(r.text).xpath('//ul[@id="threads"]//li/@id')
    ]
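# A hypothetical usage sketch (not from the original source): `client` stands
# in for whatever logged-in object exposes get_forum_threads, and 21 is a
# placeholder forum id.
thread_ids = client.get_forum_threads(21, page=1, post_per_page=50)
print(thread_ids)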
def on_new_thread(self, io, data, *ex_prms):
    try:
        if data['poster'] == self.user.user_id:
            return
        r = self.user.sess.get('https://www.fxp.co.il/showthread.php',
                               params={
                                   't': data['id'],
                                   'web_fast_fxp': 1
                               })
        forum_id = int(
            re.search(r'FORUM_ID_FXP\s*=\s*"(.+?)"', r.text).group(1))
        document = leaf.parse(r.text)
        thread_content = document.xpath(
            './/blockquote[@class="postcontent restore simple"]')[0]
        comment_id = int(thread_content.getparent().id.replace('post_message_', ''))
        quoted_me = self.is_quoted_me(thread_content)
        parsed_content = thread_content.parse(self.bbcode_formatter).strip()
        self.events.emit(
            FxpThread,
            FxpThread(username=data['username'],
                      user_id=data['poster'],
                      id=data['id'],
                      title=data['title'],
                      content=parsed_content,
                      comment_id=comment_id,
                      prefix=data['prefix'],
                      forum_id=forum_id,
                      quoted_me=quoted_me))
    except Exception as e:
        # print(e)
        pass
def get_sbl_item(self, i):
    sbl_item = {}
    status = i.xpath('tr/td[1]/img')[0].src.replace('/images/', '').replace('.gif', '')
    if status == 'spacer':
        status = i.xpath('tr/td[last()]/div/img')[0].src.replace('/images/', '').replace('.gif', '')
    sbl_item.update({'status': status})
    date = i.xpath('tr[2]/td[1]/span')[0].text
    sbl_item.update({'date': date})
    ref = i.xpath('tr/td[2]/span')[0].get('a')
    if ref:
        sbl_item.update({'ref_href': ref.href})
        page = self.s.get('http://spamhaus.org%s' % sbl_item['ref_href'], headers=self.h)
        document = leaf.parse(page.text)
        detail_text = document.xpath('body/div/table[2]/tr[2]/td[2]')[0]
        # detail_text_data = document.xpath('body/div/table[2]/tr[2]/td[2]/table/tr[3]/td')[0]
        # sbl_item.update({'date': detail_text_data.get('span').text.replace('|', '').strip()})
        sbl_item.update({'ref_detail_text': detail_text.inner_html()})
        ref_name = ref.get('b').text
    else:
        sbl_item.update({'ref_href': None})
        ref_name = i.xpath('tr/td[2]/span/b/font')[0].text
    sbl_item.update({'ref_name': ref_name})
    network = i.xpath('tr/td[3]/span')[0].text
    sbl_item.update({'network': network})
    domen = i.xpath('tr/td[4]/span')[0].get('a').text
    sbl_item.update({'domen': domen})
    ptext = i.xpath('tr[2]/td[2]/span')[0].text
    sbl_item.update({'ptext': ptext})
    self.sbl_items.append(sbl_item)
def like(self, comment_id):
    """Like comment

    Args:
        comment_id (str/int): The id of the comment.

    Returns:
        bool: True for success, False otherwise.
    """
    r = self.sess.post('https://www.fxp.co.il/ajax.php',
                       data={
                           'do': 'add_like',
                           'postid': comment_id,
                           'securitytoken': self.securitytoken
                       })
    r = self.sess.get(
        f'https://www.fxp.co.il/showthread.php#post{comment_id}',
        params={'p': comment_id})
    # FIXME: this also returns True when the comment doesn't exist
    return leaf.parse(
        r.text).xpath(f'//span[@id="{comment_id}_removelike"]') == []
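# A hypothetical usage sketch (not from the original source): `client` is a
# placeholder for the logged-in object exposing like(), and the comment id
# is made up.
if client.like(123456789):
    print('liked')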
def test_inner_html():
    html = '''<div>xxx <!-- comment --> yyy <p>foo</p> zzz</div>'''
    dom = leaf.parse(html)
    assert dom.inner_html() == 'xxx <p>foo</p> zzz'
def test_inner_methods():
    document = leaf.parse(sample)
    link = document.xpath('body/div/ul/li[@class="active_link"]')[0]
    assert link.get('a').text == ' Test link 5', 'XPath by inner lxml method'
def test_html():
    document = leaf.parse(sample)
    link = document.get('div#content li.link2')
    assert link.html() == '<li class="link2"><a href="#3"> Test link3</a></li>\n\t\t', "Convert element to html code"
def get_document(raw_html_schedule):
    return leaf.parse(leaf.strip_symbols(leaf.strip_accents(show(raw_html_schedule))))
def leaf(self):
    if not getattr(self, '_leaf', None):
        self._leaf = parse(self.body, encoding=self.charset)
    return self._leaf
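# A minimal sketch (assumptions throughout) of how a cached property like the
# one above might be wired into a response wrapper. `parse` is leaf.parse, as
# in the snippet above; the class name and constructor are hypothetical.
from leaf import parse


class Response(object):
    def __init__(self, body, charset='utf-8'):
        self.body = body
        self.charset = charset
        self._leaf = None

    @property
    def leaf(self):
        # parse lazily on first access, then reuse the cached document
        if self._leaf is None:
            self._leaf = parse(self.body, encoding=self.charset)
        return self._leaf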
def load(self, store_data=True, date_limit=None, run_agent=False):
    for data_src in self.data_sources:
        print "Loading data from: %s" % data_src
        # init variables from the data source
        url = data_src.src_id
        source_node = data_src
        parameters = data_src.get_parameters()
        username = parameters.get('username', '*****@*****.**')
        psw = parameters.get('password', 'choirpassword')
        article_css_selector = parameters.get('article-css-selector', '')
        fetch_limit = parameters.get('fetch-limit', None)
        auth = ClientAuthMethod(username, psw)
        reader = GoogleReader(auth)
        if reader.buildSubscriptionList():
            feeds = reader.getSubscriptionList()
            new_tag = DataTag.objects.get(name='new')
            new_datas = []
            fetch_count = 0
            # loop through and store feeds we already have RawData for
            for feed in feeds:
                if not fetch_limit:
                    fetch_limit = feed.unread
                read_items = []
                print "Reading " + feed.title + " (%s unread)" % feed.unread
                print "===================================================="
                print
                print "Loading items"
                print
                feed.loadItems()
                print "Loaded %s items" % (len(feed.items),)
                print
                index = 0
                for item in feed.items:
                    # make sure it doesn't already exist
                    title = item.title
                    url = item.url
                    index += 1
                    if index + 1 >= len(feed.items) and fetch_count < fetch_limit:
                        print "Loading more items...."
                        print
                        feed.loadMoreItems()
                    f = urllib.urlopen(url)
                    html = f.read()
                    doc = leaf.parse(html)
                    elements = doc(article_css_selector)
                    for element in elements:
                        # print
                        article_html = element.html()
                        new_data = RawData()
                        new_data.title = title
                        new_data.source = source_node
                        new_data.data = strip_tags(article_html)
                        new_data.data_id = item.id
                        new_data.link = item.url
                        try:
                            new_data.occurred_at = datetime.datetime.fromtimestamp(feed.lastUpdated)
                        except ValueError:
                            # print "Error, could not parse timestamp: %s" % feed.lastUpdated
                            new_data.occurred_at = datetime.datetime.now()

                        # patching in date limit thing Parris wanted --------------------------
                        # if date_limit is None:
                        #     date_limit = datetime.date.today() - datetime.timedelta(week=1)
                        #
                        # if new_data.occured_at < date_limit:
                        #     # we should skip this item .... it is too old
                        #     continue
                        #
                        # end patch -----------------------------------------------------------
                        # Abandoning this idea for now ... I think it's best to patch the
                        # map view and not mess with this for now

                        # if it doesn't already exist, save it
                        if not new_data.exists():
                            print " + Saving article: %s" % new_data.title
                            new_data.save()
                            new_data.tags.add(new_tag)
                            new_datas.append(new_data)
                            fetch_count += 1
                    read_items.append(item)
            # print "All done.\n %s items fetched, our limit is %s. There are %s feeds. We stopped at index %s" % (fetch_count, self.fetch_limit, len(feed.items), index)
            if new_datas and run_agent:
                gra = GoogleReaderAgent()
                gra.search(raw_data_set=new_datas)
            return new_datas
    return None