Example #1
    def parse(self, response):
        html = response.body
        tree = build_div_tree(html)
        soup = tree.get_root().soup_get()
        write2file = self.write2file
        divs = []

        #@ carhome
        # __navigation__ = 0
        leaf = soup.find('div', class_='cartree')
        if leaf is not None:
            write2file(leaf, 'cartree', __navigation__)
            for a in leaf.find_all('a'):
                url = a['href']
                url = urljoin(self.base_url, url)
                yield scrapys.SplashRequest(url,
                                            callback=self.parse,
                                            args={'wait': 0.5})
            divs.append(leaf)

        leaf = soup.find('div', class_='uibox-con')
        if leaf is not None:
            write2file(leaf, 'uibox-con', __navigation__)
            for a in leaf.find_all('a'):
                url = a['href']
                url = urljoin(self.base_url, url)
                yield scrapys.SplashRequest(url,
                                            callback=self.parse,
                                            args={'wait': 0.5})
            divs.append(leaf)
        leaf = soup.find('div', class_='uibox-con-search')
        if leaf is not None:
            write2file(leaf, 'uibox-con-search', __navigation__)
            for a in leaf.find_all('a'):
                url = a['href']
                url = urljoin(self.base_url, url)
                yield scrapys.SplashRequest(url,
                                            callback=self.parse,
                                            args={'wait': 0.5})
            divs.append(leaf)
        ### __picture__ = 1
        # alternative approach: mark everything 0 first, then search for img tags directly and mark those 1; just mind that column when reading the data back
        leaf = soup.find('div', class_='main')
        if leaf is not None:
            write2file(leaf, 'main', __picture__)
            divs.append(leaf)
        ### __header_nav__ = 2
        leaf = soup.find('div', class_='header-nav')
        if leaf is not None:
            write2file(leaf, 'header-nav', __header_nav__)
            divs.append(leaf)  # this appends a reference, but no deep copy is needed: rebinding leaf later points it at a new object (different id)
        ### __other__ = 3
        for leaf in soup.find_all('div'):
            if leaf not in divs:
                write2file(leaf, 'other', __other__)
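Note: these snippets come from different projects and assume scrapy-splash is already enabled in the project settings; the `scrapys` name used in several of them looks like a project-local alias (e.g. `import scrapy_splash as scrapys`). A minimal settings sketch, taken from the scrapy-splash README, with the Splash address assumed to be a local instance:

# settings.py (sketch, assuming a Splash instance at localhost:8050)
SPLASH_URL = 'http://127.0.0.1:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'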
Example #2
    def after_login(self, response):
        """ Makes first requests to get movements after sucessful login """
        if "error" in response.url:
            msg = "Failed login"
            logger.warn(msg)
            raise scrapy.exceptions.CloseSpider(reason=msg)

        last_movement_date = Movement.get_last_date()
        # If no Movement has ever been parsed, set the start date to June 2015;
        # otherwise set it to a few days before the last movement date
        if last_movement_date == date.min:
            last_movement_date = date(2015, 6, 1)
        else:
            last_movement_date = last_movement_date - timedelta(days=3)

        # Starts parsing of losses
        losses_request = scrapy_splash.SplashRequest(
            url=self.LOSSES_URL,
            callback=self.parse_movements,
            endpoint="execute",
            cache_args=["lua_source"],
            dont_filter=True,
            args={
                "lua_source": self.movements_lua,
                "moneymap_url": self.MONEYMAP_URL,
                "meseanno": last_movement_date.strftime("%m%Y"),
                "dopoAggiornamento": "false",
                "idBrand": "",
            },
            meta={"date": last_movement_date},
        )

        # Starts parsing of revenues
        revenues_request = scrapy_splash.SplashRequest(
            url=self.REVENUES_URL,
            callback=self.parse_movements,
            endpoint="execute",
            cache_args=["lua_source"],
            dont_filter=True,
            args={
                "lua_source": self.movements_lua,
                "moneymap_url": self.MONEYMAP_URL,
                "meseanno": last_movement_date.strftime("%m%Y"),
                "dopoAggiornamento": "false",
                "idBrand": "",
            },
            meta={"date": last_movement_date},
        )
        revenues_request.meta["date"] = last_movement_date

        return [losses_request, revenues_request]
Example #3
    def new_request(self,
                    url,
                    depth,
                    retry,
                    pagelink,
                    country,
                    territory,
                    retries,
                    links=None):
        """Return a new request object."""
        request = scrapy_splash.SplashRequest(
            url=self.get_next_page(url, pagelink),
            callback=self.parse,
            endpoint='execute',
            args={
                'lua_source': self.lua_script,
                'timeout': 90,
            })

        request.meta['depth'] = depth
        request.dont_filter = retry
        request.meta['pagelink'] = pagelink
        request.meta['country'] = country
        request.meta['territory'] = territory
        request.meta['retries'] = retries

        return request
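Several examples pass endpoint='execute' together with a lua_source (self.lua_script, self.movements_lua, script, script2, script_img) that is not reproduced here. A minimal sketch of what such a script might look like, modelled on the inline Lua shown in later examples; the script name and the 0.5 s wait are assumptions, not the original authors' scripts:

# hypothetical minimal lua_source for the 'execute' endpoint:
# load the page, wait briefly, and return the rendered HTML
lua_script = """
function main(splash, args)
    assert(splash:go(args.url))
    assert(splash:wait(0.5))
    return {html = splash:html()}
end
"""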
Example #4
    def parse(self, response):
        # Store CSV file
        filename = 'companies.csv'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

        # Read CSV file
        with open(filename) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            for row in csv_reader:
                if line_count == 0:
                    line_count += 1
                else:
                    # for test only: process 3-letter company codes such as TLS
                    if len(row) == 3 and len(row[1]) == 3 and line_count < self.limit:
                        self.log('COMPANY ' + row[1])
                        request = scrapy_splash.SplashRequest(
                            url='https://www.asx.com.au/asx/share-price-research/company/' + row[1],
                            callback=self.parse_price,
                            args={
                                'wait': 10,
                                'timeout': 1800,
                                'images': 0,
                            })
                        request.meta['code'] = row[1]
                        line_count += 1
                        yield request
Example #5
 def start_requests(self):
     urls = [
         self.facebook_base_url +
         f'/results/?q&content_types[0]=publication&sort_by=relevance&view=list&page={i}'
         for i in range(1, 60)
     ]
     lua_script = """
     function main(splash, args)
         splash.private_mode_enabled = false
         assert(splash:go(args.url))
         assert(splash:wait(10.0))
         return {
             html = splash:html(),
             png = splash:png()
                 }
     end
     """
     for url in urls:
         yield scrapy_splash.SplashRequest(url=url,
                                           callback=self.parse,
                                           endpoint='execute',
                                           args={
                                               'lua_source': lua_script,
                                               'html': 1,
                                               'wait': 30
                                           })
Example #6
 def parse(self, response):
     for item in response.css('div.imgbox'):
         href = item.css('a::attr(href)').extract_first()
         href = urljoin(self.base_url, href)
         yield scrapys.SplashRequest(href,
                                     callback=self.img_parse,
                                     args={'wait': 0.5})
Example #7
    def parse_word(self, response):
        #with open('word_page.html', 'w+b') as f:
        #   f.write(response.body)

        # get json from html
        json_text = response.xpath('//pre/text()').extract()[0]

        # get words
        data_dict = json.loads(json_text)
        total = data_dict['total']
        page = data_dict['page']

        # check if next page exist
        if page < total:
            url = urlunparse(
                ('http', 'dict.eudic.net', '/StudyList/GridData', '',
                 'catid=&_search=false&rows=50&page={}&sidx=&sord=asc'.format(
                     page + 1), ''))
            yield scrapy_splash.SplashRequest(
                url=url,
                callback=self.parse_word,
                headers=self.headers_word,
                endpoint='execute',
                cache_args=['lua_source'],
                args={'lua_source': script},
                meta={'cookiejar': response.meta['cookiejar']})

        # save word
        for word_dict in data_dict['rows']:
            word = EudicWordsItem()
            word['word'] = word_dict['id']
            yield word
Example #8
    def start_requests(self):
        self.link_server = redis.StrictRedis(host='127.0.0.1', port=6379, db=1)
        self.img_server = redis.StrictRedis(host='127.0.0.1', port=6379, db=2)
        # load the training data and build the classification model
        data = np.loadtxt('classify_0.data', dtype=float,
                          delimiter=',', usecols=(0, 1, 2, 3, 4))
        x, y = np.split(data, (4,), axis=1)  # split into features and labels
        for i in range(1, 4):  # whether to skip the 'other' class? it is hard to distinguish; 'other' is in classify_3
            data = np.loadtxt('classify_{}.data'.format(
                i), dtype=float, delimiter=',', usecols=(0, 1, 2, 3, 4))
            if len(data) > 150:
                data = data[0:150, :]
            x_temp, y_temp = np.split(data, (4,), axis=1)  # split into features and labels
            x = np.vstack((x, x_temp))
            y = np.vstack((y, y_temp))
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, random_state=1, train_size=0.8)
        # svm
        # self.model = svm.SVC(kernel='rbf')
        # decision tree
        # self.model = tree.DecisionTreeClassifier()
        # knn
        # self.model = neighbors.KNeighborsClassifier()
        # bayes
        self.model = naive_bayes.MultinomialNB()
        # MLP
        # self.model = neural_network.MLPClassifier(
        #   solver='lbfgs', activation='tanh')
        # to fit the model
        self.model.fit(x_train, y_train.ravel())

        for item in self.start_urls:
            yield scrapys.SplashRequest(
                item, callback=self.parse, args={'wait': 0.5})
Example #9
    def parse_colleges(self, response):
        #-------------------------------------------#

        startUrl = "https://sports.usatoday.com/ncaa/salaries/"

        #basketball
        #startUrl = "https://sports.usatoday.com/ncaa/salaries/mens-basketball/coach/"
        #-------------------------------------------#

        # next step is to simply create a dict that maps
        all_them = "/html/body/div[6]/div[4]/div[2]/div[1]/div/div[1]/div/section/div[2]/table/tbody/tr/td[@class='']"
        lol = response.selector.xpath(all_them).extract()
        count = 0

        # click every second item out of each group of four
        to_click = True
        for one in lol:
            count += 1
            #print(str(count)+" " + one)
            if (count % 2 == 0 and to_click):
                # request and click on button
                to_click = False
            # print(str(count)+" " + one)

            if (count % 4 == 0):
                to_click = True

        # count is now the total number of colleges; I need 1..count

        #total_num = 131
        total_num = 81  #basketball
        for one in range(1, total_num):
            self.listofcolleges.add(one)

        while (len(self.listofcolleges) != 0):
            counter = self.listofcolleges.pop()
            button = self.return_css_lol(counter)
            time.sleep(3)
            LUA_SCRIPT = """
function main(splash)
    assert(splash:go(splash.args.url))
    local element = splash:select('%s')
    local bounds = element:bounds()
    assert(element:mouse_click{x=bounds.width/3, y=bounds.height/3})
    assert(splash:wait(5))
    return splash:html()
end
""" % (button)
            #print(button)
            # print(LUA_SCRIPT)
            SCRAPY_ARGS = {'lua_source': LUA_SCRIPT}
            the_request = scrapy_splash.SplashRequest(
                url=startUrl,
                callback=self.parse_college,
                endpoint='execute',
                args=SCRAPY_ARGS)
            the_request.meta['counter'] = counter

            yield the_request
Example #10
 def start_requests(self):
     for beerId in list(
             filter(lambda x: x not in self.ids_seen, range(200, 300))):
         yield scrapy_splash.SplashRequest(
             self.start_url + str(beerId) + '/',
             self.parse,
             args={'lua_source': self.lua_script},
             meta={'id': beerId})
Example #11
 def start_requests(self):
     for url in self.start_urls:
         yield scrapy_splash.SplashRequest(url=url,
                                           callback=self.collect_data,
                                           dont_filter=True,
                                           endpoint='render.html',
                                           args={'wait': 4},
                                           meta={'url': url})
Example #12
 def start_requests(self):
     """
     Handle start_requests directly so we can explicitly return
     SplashRequest objects with a three second wait.
     """
     for url in self.start_urls:
         yield scrapy_splash.SplashRequest(url,
                                           self.parse,
                                           endpoint="execute",
                                           args={"lua_source": lua_script})
Example #13
 def request(self, url, domain):
     return scrapy_splash.SplashRequest(
         url,
         endpoint='render.json',
         args={
             'png': 1,
             'html': 1
         },
         callback=partial(self.parse, domain=domain),
     )
Example #14
    def start_requests(self):
        """
        initial requests for the crawler

        :return:
        """
        urls = self.build_urls()

        for url in urls:
            yield scrapy_splash.SplashRequest(url=url,
                                              callback=self.parse_zone)
Example #15
 def start_requests(self):
     self.web_parameter = {
         'link_num': 0,
         'img_num': 0,
         'content_length': 0,
         'tag_num': 0,
     }
     for item in self.start_urls:
         yield scrapys.SplashRequest(item,
                                     callback=self.parse,
                                     args={'wait': 0.5})
Example #16
 def parse(self, response):
     for item in response.css('a.tsla-header-nav--list_link'):
         url = item.css('::attr(href)').extract_first()
         next_page = response.urljoin(url)
         key = url.split('/')[-1]
         if key in self.callbacks:
             request = scrapys.SplashRequest(
                 next_page, callback=self.callbacks[key], args={'wait': 0.5})
             request.meta['key'] = key
             yield request
         else:
             yield None
Example #17
 def img_parse(self, response):
     # click each image
     for item in response.css('.uibox-con ul li'):
         url = item.css('a::attr(href)').extract_first()
         url = urljoin(self.base_url, url)
         request = scrapys.SplashRequest(url,
                                         callback=self.return_item,
                                         args={'wait': 0.5})
         request.meta['brand'] = response.meta['brand']
         request.meta['series'] = response.meta['series']
         request.meta['kind'] = response.meta['kind']
         yield request
Example #18
 def start_requests(self):
     urls = ['https://research.yandex.com/publications']
     for url in urls:
         yield scrapy_splash.SplashRequest(url=url,
                                           callback=self.parse,
                                           endpoint='/execute',
                                           args={
                                               'html': 1,
                                               'lua_source':
                                               self.lua_script,
                                               'wait': 30,
                                           })
Example #19
 def brand_parse(self, response):
     # series options
     for item in response.css('.cartree ul li.current dl dd a'):
         # for item in response.css('#series_2368'):  # for debugging; use the line above for the real crawl
         url = item.css('::attr(href)').extract_first()
         url = urljoin(self.base_url, url)
         series = item.css('::text').extract_first()
         request = scrapys.SplashRequest(url,
                                         callback=self.series_parse,
                                         args={'wait': 0.5})
         request.meta['brand'] = response.meta['brand']
         request.meta['series'] = series.strip()
         yield request
Example #20
 def kind_parse(self, response):
     # select the exterior option
     for item in response.css('div.search-pic li'):
         if item.css('::text').extract_first() == '车身外观':
             url = item.css('a::attr(href)').extract_first()
             url = urljoin(self.base_url, url)
             request = scrapys.SplashRequest(url,
                                             callback=self.img_parse,
                                             args={'wait': 0.5})
             request.meta['brand'] = response.meta['brand']
             request.meta['series'] = response.meta['series']
             request.meta['kind'] = response.meta['kind']
             yield request
Example #21
 def start_requests(self):
     urls = [
         'https://www.instagram.com/explore/tags/fashion/',
     ]
     for url in urls:
         yield scrapy_splash.SplashRequest(
             url,
             self.parse,
             endpoint='execute',
             args={
                 'wait': 2,
                 'lua_source': script2
             },
         )
Example #22
 def parse(self, response):
     pages = response.xpath(r"//a[re:test(@id, '\d$')]")
     for page_aid in pages:
         page_id = page_aid.xpath("text()").extract_first()
         req_url = self.start_urls[0] + page_aid.xpath('@href').extract()[0]
         request = scrapy_splash.SplashRequest(
             req_url,
             self.parse_images,
             args={'wait': 2},
             slot_policy=scrapy_splash.SlotPolicy.PER_DOMAIN,
         )
         request.meta['page'] = int(page_id)
         request.meta['n_pages'] = len(pages)
         yield request
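The slot_policy argument used above controls how Splash requests share Scrapy's download slots. A short sketch of the three policies that scrapy_splash.SlotPolicy exposes (PER_DOMAIN is the default when slot_policy is not given):

from scrapy_splash import SlotPolicy

# PER_DOMAIN     - one slot per original domain (the default)
# SINGLE_SLOT    - all Splash requests share a single slot
# SCRAPY_DEFAULT - let Scrapy assign slots as for ordinary requests
print(SlotPolicy.PER_DOMAIN, SlotPolicy.SINGLE_SLOT, SlotPolicy.SCRAPY_DEFAULT)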
Example #23
 def start_requests(self):
     # start_urls = ['https://item.jd.com/6946605.html']  # page encoded as 'gbk'
     start_urls = ['https://music.163.com']
     change_url = 'https://music.163.com/artist?id=3684'
     args = {
         'html': 1,
         'png': 1,
         'wait': 0.5,
     }
     for url in start_urls:
         # scrapy-splash usage, method 1
         yield scrapy_splash.SplashRequest(url,
                                           self.parse,
                                           args={'wait': 2.5},
                                           splash_url=change_url)
Example #24
    def start_requests(self):
        if (len(self.still_need) == 0):
            print("All good I'm done")
            quit()

            #-----------------------------------------#

        startUrl = "https://sports.usatoday.com/ncaa/salaries/"

        #basketball
        #startUrl = "https://sports.usatoday.com/ncaa/salaries/mens-basketball/coach/"
        #-----------------------------------------#

        yield scrapy_splash.SplashRequest(url=startUrl,
                                          callback=self.parse_colleges,
                                          args={'wait': 0.5})
Example #25
 def parse(self, response):
     # brand options
     for item in response.css('.cartree ul li a'):
         # for item in response.css('#b134'):  # for debugging; use the line above for the real crawl
         url = item.css('::attr(href)').extract_first()
         url = urljoin(self.base_url, url)
         if self.bf.isContains(url):
             continue
         else:
             self.bf.insert(url)
             brand = item.css('::text').extract_first()
             request = scrapys.SplashRequest(url,
                                             callback=self.brand_parse,
                                             args={'wait': 0.5})
             request.meta['brand'] = brand.strip()
             yield request
Example #26
 def series_parse(self, response):
     # select the model option
     for item in response.css('div.search-pic dl'):
         urls = item.css('dd a::attr(href)').extract()
         kinds = item.css('dd a::text').extract()
         years = item.css('dt::text').extract()
         for url, year, kind in zip(urls, years, kinds):
             url = urljoin(self.base_url, url)
             kind = year + kind
             request = scrapys.SplashRequest(url,
                                             callback=self.kind_parse,
                                             args={'wait': 0.5})
             request.meta['brand'] = response.meta['brand']
             request.meta['series'] = response.meta['series']
             request.meta['kind'] = kind.strip()
             yield request
Example #27
    def start_requests(self):
        url = 'http://www.inveno.cn/'
        yield scrapy_splash.SplashRequest(
            url,
            self.parse_result,
            args={
                # optional; parameters passed to Splash HTTP API
                'wait': 0.5,

                # 'url' is prefilled from request url
                # 'http_method' is set to 'POST' for POST requests
                # 'body' is set to request body for POST requests
            },
            splash_url='http://192.168.1.125:8050/',  # optional; overrides SPLASH_URL
            slot_policy=scrapy_splash.SlotPolicy.PER_DOMAIN,  # optional
        )
Example #28
    def after_login(self, response):
        #with open('login_page.html', 'w+b') as f:
        #    f.write(response.body)

        # http://dict.eudic.net/StudyList/GridData?catid=&_search=false&rows=5&page=1&sidx=&sord=asc
        url = urlunparse(
            ('http', 'dict.eudic.net', '/StudyList/GridData', '',
             'catid=&_search=false&rows=50&page=1&sidx=&sord=asc', ''))

        return scrapy_splash.SplashRequest(
            url=url,
            callback=self.parse_word,
            headers=self.headers_word,
            endpoint='execute',
            cache_args=['lua_source'],
            args={'lua_source': script},
            meta={'cookiejar': response.meta['cookiejar']})
Example #29
    def parse_price(self, response):
        dividens = response.xpath(
            '//td[@class="overview-dividends"]/table//tr')
        json = {}
        json[response.meta['code']] = {
            "summary": {
                "Summary_Value":
                response.xpath('//span[@ng-show="share.last_price"]/text()'
                               ).extract_first(),
                "market_cap":
                response.xpath(
                    '//div[@ng-switch="share.market_cap"]/span/text()').
                extract_first(),
                "dividens": {
                    "most_recent":
                    dividens[0].xpath('td[2]/span//text()').extract_first(),
                    "Dividend ex-date":
                    dividens[1].xpath('td[2]//text()').extract_first(),
                    "Dividend pay date":
                    dividens[2].xpath('td[2]//text()').extract_first(),
                    "Franking":
                    dividens[3].xpath('td[2]//text()').extract_first(),
                    "Annual dividend yield":
                    dividens[4].xpath('td[2]/span//text()').extract_first(),
                }
            }
        }

        request = scrapy_splash.SplashRequest(
            url='https://www.asx.com.au/asx/share-price-research/company/' +
            response.meta['code'] + '/statistics/shares',
            callback=self.parse_statistic,
            args={
                # optional; parameters passed to Splash HTTP API
                'timeout': 1800,
                "wait": 10,
                'images': 0,
                # 'url' is prefilled from request url
                # 'http_method' is set to 'POST' for POST requests
                # 'body' is set to request body for POST requests
            },
        )
        request.meta['json'] = json
        request.meta['code'] = response.meta['code']

        yield request
Example #30
    def parse_page(self, response):
        content_selector = response.css('div#content div.wiki-content')
        title = response.css('#title-text a::text')[0].extract()
        bread_crumbs = response.css('ol#breadcrumbs a::text').extract()
        content = content_selector[0].extract()

        path = ''
        for bread_crumb in bread_crumbs:
            path = path + bread_crumb + '/'

        # image info
        content = content.replace('&amp;', '&')
        imgs = content_selector.css('img')
        i = 1
        for img in imgs:
            src = img.css('::attr(src)')[0].extract()
            img_name = title + str(i) + '.png'
            # content = content.decode('utf-8').replace(src.decode('utf-8'), img_name).encode('utf-8')
            content = content.replace(src, img_name)
            i += 1

            img_url = self.base_url + src
            yield scrapy_splash.SplashRequest(
                url=img_url,
                callback=self.parse_img,
                args={
                    'wait': 0.1,
                    'lua_source': script_img,
                },
                meta={
                    'img_name': img_name,
                    'path': path
                },
                endpoint='execute',  # optional; default is render.html
                splash_url='http://127.0.0.1:8050',  # optional; overrides SPLASH_URL
                slot_policy=scrapy_splash.SlotPolicy.PER_DOMAIN,  # optional
            )

        item = ConfluenceItem()
        item['name'] = title
        item['path'] = path
        item['content'] = content

        yield item