Example #1
    # method of a scraper class: self.initial_url, self.count, self.limit
    # and the dbhelper module are defined elsewhere in the original project
    def scrape(self, url=None):
        if url is None:
            url = self.initial_url
        g = Grab()
        g.go(url)

        news_blocks = g.doc.select("//li[contains(@class, 'river-block')]")

        for news_block in news_blocks:
            self.count += 1
            if self.count >= self.limit:
                return

            try:
                news_link = news_block.select('.//h2/a')[0]
                news_title = news_link.text()
                news_href = g.make_url_absolute(news_link.attr('href'))
            except IndexError:
                # skip river blocks without a headline link
                continue

            if dbhelper.get_entries({'href': news_href}).count():
                print("News %s already exists" % news_href)
                continue

            news_grab = Grab()
            news_grab.go(news_href)

            try:
                news_text = news_grab.doc.select(
                    './/div[contains(@class, "article-entry")]')[0].text(smart=True)
            except IndexError:
                news_text = ''

            print('%s: %s' % (news_title, news_href))

            try:
                entry_id = dbhelper.save_entry({
                    "href": news_href,
                    "title": news_title,
                    "text": news_text
                })

                print(str(entry_id))
            except DuplicateKeyError as e:
                print("News %s already exists" % news_href)

        if self.count < self.limit:
            try:
                # follow the "next" pagination link and recurse
                next_page_url = g.make_url_absolute(
                    g.doc.select('//ol[contains(@class, "pagination")]'
                                 '//li[contains(@class, "next")]//a')[0].attr('href'))
                self.scrape(next_page_url)
            except IndexError:
                print('No more news')
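
The method leans on a dbhelper module exposing get_entries and save_entry; since it catches pymongo's DuplicateKeyError, a MongoDB-backed helper is the likely shape. A minimal sketch under that assumption (connection settings, database and collection names are invented):

# dbhelper.py -- hypothetical sketch of the helper the scraper expects
from pymongo import MongoClient

client = MongoClient('localhost', 27017)      # assumed connection settings
collection = client.news_db.entries           # assumed database/collection names
collection.create_index('href', unique=True)  # makes save_entry raise DuplicateKeyError


def get_entries(query):
    """Return a cursor over entries matching the query.

    The caller uses Cursor.count(), so pymongo 3.x is assumed.
    """
    return collection.find(query)


def save_entry(entry):
    """Insert an entry and return its generated id."""
    return collection.insert_one(entry).inserted_id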
Example #2
# spider callback: Grab and Task come from grab / grab.spider, and
# self.meta is attached by the code that launches the spider
def task_initial(self, grab: Grab, task: Task):
    from web.models import SentUrls
    send_url = self.meta.get('send_url')
    teleuser = self.meta.get('teleuser')
    if_newuser = self.meta.get('if_newuser')
    for url in grab.doc.select(
            ".//*[@id='offers_table']//*/td[1]/a/@href"):
        # if the url was already sent to this user, stop parsing
        if SentUrls.objects.filter(teleuser=teleuser,
                                   url=url.text().split('.html')[0] +
                                   '.html').exists():
            print('No new ads')
            return
        # send the url to the subscribed user
        send_url(url.text())
        # if the user has just subscribed, send only the latest ad
        # in the category and stop
        if if_newuser:
            return
    try:
        next_page = grab.doc.select(
            ".//*[contains(@class, 'next')]/a[contains(@class, 'pageNextPrev')]/@href"
        ).one().text()
        yield Task('initial', url=grab.make_url_absolute(next_page))
    except IndexError:
        # no next-page link found
        pass
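
self.meta is not a built-in Spider attribute, so presumably the launching code attaches it before the crawl starts. A hedged sketch of such a driver; the class name, start URL and callback stand-ins are all assumptions:

from grab.spider import Spider, Task


class AdsSpider(Spider):
    # hypothetical wrapper: the task_initial handler above lives here
    initial_urls = ['https://www.olx.ua/nedvizhimost/']  # assumed category page


bot = AdsSpider(thread_number=1)
bot.meta = {                # attached by hand; Spider has no built-in meta
    'send_url': print,      # stand-in for the real "send to Telegram" callback
    'teleuser': None,       # replace with a real web.models user
    'if_newuser': False,
}
bot.run()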
Example #3
from urllib.parse import urlparse

import weblib.error
from feedgen.feed import FeedGenerator
from flask import make_response
from grab import Grab


def feed_http(request):
    """HTTP Cloud Function.
    Args:
        request (flask.Request): The request object.
        <http://flask.pocoo.org/docs/1.0/api/#flask.Request>
    Returns:
        The response text, or any set of values that can be turned into a
        Response object using `make_response`
        <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>.
    """
    request_args = request.args
    url = request_args['url']
    g = Grab()
    fg = FeedGenerator()
    g.go(url)

    fg.id(url)
    fg.title('Rabota.UA | rss feed')
    url_parsed = urlparse(g.response.url)
    fg.link(href=url_parsed.scheme + '://' + url_parsed.hostname,
            rel='alternate')
    fg.description(g.doc('/html/head/title').text())
    count = int(
        g.doc('//span[@id="ctl00_content_vacancyList_ltCount"]/span')
        .one().text())
    if count == 0:
        itm_list = []
    else:
        articles = g.doc.select(
            '//table[contains(@class, "f-vacancylist-tablewrap")]').one()
        itm_list = articles.select(
            'tr[@id]/td/article/div[contains(@class, "card-body")]')
    for item in itm_list:
        vac_title = item.select(
            'div[1]//h2[contains(@class, "card-title")]/a/@title'
        ).text().strip()
        vac_url = g.make_url_absolute(
            item.select(
                'div[1]//h2[contains(@class, "card-title")]/a/@href').text())
        try:
            vac_description = item.select(
                'div[contains(@class, "card-description")]').text().strip()
        except weblib.error.DataNotFound:
            vac_description = 'N/A'
        fe = fg.add_entry()
        print(vac_title)
        fe.id(vac_url)
        fe.link({'href': vac_url})
        fe.source(vac_url)
        fe.title(vac_title)
        fe.description(vac_description)

    response = make_response(fg.atom_str(pretty=True, extensions=False))
    # the feed is built with atom_str(), so label it as Atom rather than RSS
    response.headers['Content-Type'] = 'application/atom+xml; charset=UTF-8'
    return response
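
Before deploying, the function can be exercised locally through a Flask test request context; the query url below is only an example:

from flask import Flask, request

app = Flask(__name__)

# simulate the HTTP trigger locally
with app.test_request_context('/?url=https://rabota.ua/jobsearch/vacancy_list'):
    resp = feed_http(request)
    print(resp.status_code, resp.headers['Content-Type'])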
Example #4
from grab import Grab

g = Grab()
g.go('https://github.com/login')
print(g.doc.form)
g.doc.set_input('login', '*****@*****.**')
g.doc.set_input('password', '')
g.doc.submit()
g.doc.save('/tmp/x.html')


home_url = g.doc('//a[contains(@class, "header-nav-link name")]/@href').text()
repo_url = home_url + '?tab=repositories'

g.go(repo_url)
for elem in g.doc.select('//h3[@class="repo-list-name"]/a'):
    print('%s: %s' % (elem.text(),
                      g.make_url_absolute(elem.attr('href'))))




# from grab.spider import Spider, Task
# import logging
#
# class ExampleSpider(Spider):
#     def task_generator(self):
#         for lang in ('python', 'ruby', 'perl'):
#             url = 'https://www.google.com/search?q=%s' % lang
#             yield Task('search', url=url, lang=lang)
#
#     def task_search(self, grab, task):
#         print('%s: %s' % (task.lang,
#                           grab.doc('//title').text()))  # hypothetical completion: the original line was cut off
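#
# If the commented-out spider is enabled, it can be launched like this
# (the thread count is arbitrary):
#
# logging.basicConfig(level=logging.DEBUG)  # show what the spider is doing
#
# bot = ExampleSpider(thread_number=2)
# bot.run()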
Example #5
# continues the previous example: g is the logged-in Grab instance, while
# your_login and is_number_regex are assumed to be defined elsewhere
g.go(repo_url)
g.doc.save('x.html')

max_page = 0
for elem in g.doc.select('//div[@class="paginate-container"]/div/a'):
    text = elem.text()
    if is_number_regex(text):
        pag_num = int(text)
        if max_page < pag_num:
            max_page = pag_num

print('---------------------------------------------------')
print('Found ' + str(max_page) + ' pages.')
print('---------------------------------------------------')

with open("MyStars.md", "w") as text_file:
    text_file.write("# My stars #\n")
    for current_page in range(1, max_page + 1):
        repo_url = 'https://github.com/' + your_login + '?page=' + str(
            current_page) + '&tab=stars'
        print('---------------------------------------------------')
        print('Processing page ' + str(current_page) + '.')
        print('---------------------------------------------------')
        g.go(repo_url)

        for elem in g.doc.select('//div[@class="d-inline-block mb-1"]/h3/a'):
            # markdown link syntax takes the url without surrounding quotes
            text_file.write("- [{1}]({0})\n".format(
                g.make_url_absolute(elem.attr('href')), elem.text()))
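
is_number_regex is never shown in the snippet; a plausible implementation matching its name, offered only as a sketch:

import re

_NUMBER_RE = re.compile(r'^\d+$')


def is_number_regex(value):
    """Return True if the string is a plain decimal number."""
    return bool(_NUMBER_RE.match(value))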