def scrape(self, url=None):
    # Fetch a listing page (the initial one by default) and walk its news blocks.
    if url is None:
        url = self.initial_url
    g = Grab()
    g.go(url)
    news_blocks = g.doc.select("//li[contains(@class, 'river-block')]")
    for news_block in news_blocks:
        self.count += 1
        if self.count >= self.limit:
            return
        try:
            news_link = news_block.select('.//h2/a')[0]
            news_title = news_link.text()
            news_href = g.make_url_absolute(news_link.attr('href'))
        except IndexError:
            # Block without a headline link, skip it.
            continue
        if dbhelper.get_entries({'href': news_href}).count():
            print("News %s already exists" % news_href)
            continue
        # Fetch the article page itself to extract the body text.
        news_grab = Grab()
        news_grab.go(news_href)
        try:
            news_text = news_grab.doc.select(
                './/div[contains(@class, "article-entry")]')[0].text(smart=True)
        except IndexError:
            news_text = ''
        print('%s: %s' % (news_title, news_href))
        try:
            entry_id = dbhelper.save_entry({
                "href": news_href,
                "title": news_title,
                "text": news_text
            })
            print(str(entry_id))
        except DuplicateKeyError:
            print("News %s already exists" % news_href)
    if self.count < self.limit:
        # Follow the "next" pagination link, if any, and keep scraping.
        try:
            next_page_url = g.make_url_absolute(
                g.doc.select('//ol[contains(@class, "pagination")]'
                             '//li[contains(@class, "next")]//a')[0].attr('href'))
            self.scrape(next_page_url)
        except IndexError:
            print('No more news')
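
# Rough sketch of the class context the scrape() method above assumes: the class
# name NewsScraper and the example URL are placeholders, dbhelper is the project's
# own database helper, and DuplicateKeyError is assumed to come from pymongo
# (which matches the get_entries/save_entry usage).
from grab import Grab
from pymongo.errors import DuplicateKeyError

import dbhelper  # assumed project module wrapping the news collection


class NewsScraper:
    def __init__(self, initial_url, limit=50):
        self.initial_url = initial_url  # first listing page to fetch
        self.limit = limit              # stop after this many news blocks
        self.count = 0                  # blocks processed so far

    # scrape() from the snippet above goes here.


# Usage sketch:
# NewsScraper('https://example.com/news/', limit=20).scrape()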
def task_initial(self, grab: Grab, task: Task):
    from web.models import SentUrls

    send_url = self.meta.get('send_url')
    teleuser = self.meta.get('teleuser')
    if_newuser = self.meta.get('if_newuser')
    for url in grab.doc.select(
            ".//*[@id='offers_table']//*/td[1]/a/@href"):
        # If the url was already sent to the user, stop parsing.
        if SentUrls.objects.filter(
                teleuser=teleuser,
                url=url.text().split('.html')[0] + '.html').exists():
            print('No new ads')
            return
        # Send the url to the subscribed user.
        send_url(url.text())
        # If the user has just subscribed, send only the latest ad and stop.
        if if_newuser:
            return
    try:
        next_page = grab.doc.select(
            ".//*[contains(@class, 'next')]/a[contains(@class, 'pageNextPrev')]/@href"
        ).one().text()
        yield Task('initial', url=grab.make_url_absolute(next_page))
    except IndexError:
        pass
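
# A rough sketch of how the task_initial handler above might be wired up and run.
# The AdsSpider class name, the constructor arguments, and the start URL are
# assumptions for illustration; only the meta keys (send_url, teleuser, if_newuser)
# come from the snippet itself.
import logging

from grab import Grab
from grab.spider import Spider, Task


class AdsSpider(Spider):
    def __init__(self, send_url, teleuser, if_newuser, **kwargs):
        super().__init__(**kwargs)
        # task_initial reads these via self.meta.get(...)
        self.meta = {
            'send_url': send_url,      # callable that delivers a url to the user
            'teleuser': teleuser,      # the subscribed Telegram user
            'if_newuser': if_newuser,  # True -> send only the latest ad and stop
        }

    def task_generator(self):
        # Placeholder listing page; the real category URL comes from the project.
        yield Task('initial', url='https://www.olx.ua/some-category-placeholder/')

    # task_initial from the snippet above goes here.


# logging.basicConfig(level=logging.DEBUG)
# AdsSpider(send_url=print, teleuser=None, if_newuser=False, thread_number=2).run()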
def feed_http(request):
    """HTTP Cloud Function.

    Args:
        request (flask.Request): The request object.
        <http://flask.pocoo.org/docs/1.0/api/#flask.Request>
    Returns:
        The response text, or any set of values that can be turned into a
        Response object using `make_response`
        <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>.
    """
    request_args = request.args
    url = request_args['url']
    g = Grab()
    fg = FeedGenerator()
    g.go(url)
    fg.id(url)
    fg.title('Rabota.UA | rss feed')
    url_parsed = urlparse(g.response.url)
    fg.link(href=url_parsed.scheme + '://' + url_parsed.hostname, rel='alternate')
    fg.description(g.doc('/html/head/title').text())
    count = int(
        g.doc('//span[@id="ctl00_content_vacancyList_ltCount"]/span').one().text())
    if count == 0:
        itm_list = []
    else:
        articles = g.doc.select(
            '//table[contains(@class, "f-vacancylist-tablewrap")]').one()
        itm_list = articles.select(
            'tr[@id]/td/article/div[contains(@class, "card-body")]')
    for item in itm_list:
        vac_title = item.select(
            'div[1]//h2[contains(@class, "card-title")]/a/@title').text().strip()
        vac_url = g.make_url_absolute(
            item.select(
                'div[1]//h2[contains(@class, "card-title")]/a/@href').text())
        try:
            vac_description = item.select(
                'div[contains(@class, "card-description")]').text().strip()
        except weblib.error.DataNotFound:
            vac_description = 'N/A'
        fe = fg.add_entry()
        print(vac_title)
        fe.id(vac_url)
        fe.link({'href': vac_url})
        fe.source(vac_url)
        fe.title(vac_title)
        fe.description(vac_description)
    response = make_response(fg.atom_str(pretty=True, extensions=False))
    response.headers['Content-Type'] = 'application/rss+xml; charset=UTF-8'
    return response
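
# Minimal local smoke test for feed_http, assuming the function above lives in
# main.py together with its own imports (Grab, FeedGenerator, urlparse,
# make_response, weblib). The query url is only a placeholder for a rabota.ua
# search page.
from flask import Flask, request

from main import feed_http

app = Flask(__name__)

with app.test_request_context('/?url=https://rabota.ua/zapros/python'):
    resp = feed_http(request)
    print(resp.headers['Content-Type'])
    print(resp.get_data(as_text=True)[:500])  # first chunk of the generated feed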
from grab import Grab

g = Grab()
g.go('https://github.com/login')
print(g.doc.form)
g.doc.set_input('login', '*****@*****.**')
g.doc.set_input('password', '')
g.doc.submit()
g.doc.save('/tmp/x.html')
home_url = g.doc('//a[contains(@class, "header-nav-link name")]/@href').text()
repo_url = home_url + '?tab=repositories'
g.go(repo_url)
for elem in g.doc.select('//h3[@class="repo-list-name"]/a'):
    print('%s: %s' % (elem.text(), g.make_url_absolute(elem.attr('href'))))

# from grab.spider import Spider, Task
# import logging
#
# class ExampleSpider(Spider):
#     def task_generator(self):
#         for lang in ('python', 'ruby', 'perl'):
#             url = 'https://www.google.com/search?q=%s' % lang
#             yield Task('search', url=url, lang=lang)
#
#     def task_search(self, grab, task):
#         print('%s: %s' % (task.lang,
#                           grab.doc('//div[@class="s"]//cite').text()))
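
# How the commented-out ExampleSpider above is typically started, following the
# usual grab.spider pattern (thread_number is an arbitrary choice):
#
# logging.basicConfig(level=logging.DEBUG)
# bot = ExampleSpider(thread_number=2)
# bot.run()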
# Continuation of the GitHub session above: `g` is the logged-in Grab instance,
# and `your_login` (the GitHub username) is assumed to be defined earlier.
g.go(repo_url)
g.doc.save('x.html')
max_page = 0
# Find the highest page number in the pagination block of the listing.
for elem in g.doc.select('//div[@class="paginate-container"]/div/a'):
    text = elem.text()
    if is_number_regex(text):
        pag_num = int(text)
        if max_page < pag_num:
            max_page = pag_num
print('---------------------------------------------------')
print('Found ' + str(max_page) + ' pages.')
print('---------------------------------------------------')
text_file = open("MyStars.md", "w")
text_file.write("# My stars #\n")
for current_page in range(1, max_page + 1):
    repo_url = ('https://github.com/' + your_login +
                '?page=' + str(current_page) + '&tab=stars')
    print('---------------------------------------------------')
    print('Processing page ' + str(current_page) + '.')
    print('---------------------------------------------------')
    g.go(repo_url)
    # Write each starred repository as a Markdown list item: "- [name](url)".
    for elem in g.doc.select('//div[@class="d-inline-block mb-1"]/h3/a'):
        text_file.write("- [{1}]({0})\n".format(
            g.make_url_absolute(elem.attr('href')), elem.text()))
text_file.close()
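
# The pagination loop above calls is_number_regex(), which is not defined in the
# snippet; a minimal sketch of what such a helper might look like (it would need
# to be defined before the loop runs):
import re

NUMBER_RE = re.compile(r'^\d+$')


def is_number_regex(value):
    """Return True if the pagination link text is a plain page number."""
    return bool(NUMBER_RE.match(value.strip()))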