Example 1
 def start_requests(self):
     # the 'cookiejar' meta key lets Scrapy manage cookies automatically; 'filepath' records the directory and is passed along to the response
     return [Request(self.target_url + str(self.page['/']), meta={'cookiejar': 1, 'filepath': '/'})]
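A minimal sketch of the callback side under the same assumptions ('parse' and the 'next/' path are placeholders): 'filepath' comes back on response.meta, and forwarding the same 'cookiejar' value keeps follow-up requests in the same cookie jar.
 def parse(self, response):
     # 'filepath' set in start_requests() travels along with the response
     filepath = response.meta['filepath']
     self.logger.info("crawled directory: %s", filepath)
     # reuse the same jar so CookiesMiddleware keeps this session's cookies
     yield Request(response.urljoin('next/'),  # hypothetical next directory
                   meta={'cookiejar': response.meta['cookiejar'],
                         'filepath': filepath + 'next/'},
                   callback=self.parse)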
Example 2
 def parse_sub_categories(self, response):
     sub_categories = response.xpath("//*[contains(@href, 'dir/index?')]/@href").extract()
     for link in sub_categories:
         url = "https://br.answers.yahoo.com%s" % link
         yield Request(url, callback=self.parse_question)
Example 3
 def start_requests(self):
     for i in range(1, 10):
         yield Request('http://www.kuaidaili.com/free/inha/%s/' % i,
                       callback=self.parse)
Example 4
    def start_requests(self):
        print("start request.", self.parse_idx)

        yield Request(self.start_urls[0], callback=self.parse_fun[self.parse_idx])
Example 5
 def start_requests(self):
     base_url = 'http://guba.eastmoney.com/list,%s.html'
     for ticker_id in open('ticker_list.txt'):
         url = base_url % ticker_id.strip()
         yield Request(url, self.parse_item)
Example 6
            url=SETTINGS["CRAWLERA_FETCH_URL"],
            status=200,
            headers={
                "Content-Type": "application/json",
                "Content-Encoding": "gzip",
                "Transfer-Encoding": "chunked",
                "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
                "Proxy-Connection": "close",
                "Connection": "close",
            },
            request=Request(
                url=SETTINGS["CRAWLERA_FETCH_URL"],
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": request_to_dict(
                            Request("https://fake.host.com"),
                            spider=dummy_spider,
                        ),
                    }
                },
            ),
            body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""",  # noqa: E501
        ),
        "expected": TextResponse(
            url="https://fake.host.com",
            status=123,
            headers={"Fake-Header": "true"},
            body=b"""foobar""",  # noqa: E501
        ),
    }
)
Example 7
 def start_requests(self):
     for url in self.start_urls:
         yield Request(url, cookies=self.cookie_dict)
Example 8
 def get_media_requests(self, item, info):
     for url in item['file_urls']:
         yield Request(url, meta={'item': item})
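A minimal sketch of how the item stashed in meta above could be read back inside the same pipeline, assuming a FilesPipeline subclass on Scrapy 2.4 or later (the 'category' field is a placeholder):
from scrapy.pipelines.files import FilesPipeline

class MyFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # the item attached in get_media_requests() rides along on request.meta
        stashed = request.meta['item']
        return 'full/%s/%s' % (stashed['category'], request.url.split('/')[-1])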
Example 9
 def start_requests(self):
     yield Request(url='https://movie.douban.com/chart',
                   callback=self.parse_rank)
Example 10
 def start_requests(self):
     # the 'cookiejar' meta key lets Scrapy manage cookies automatically; several jars can be kept at once, one per distinct 'cookiejar' value
     return [Request('http://temp.wuxingxiangsheng.com/test/request', meta={'cookiejar': 1})]
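Since the comment mentions juggling several jars at once, a minimal sketch of that pattern under the same assumption (the repeated URL is only for illustration); each distinct 'cookiejar' value gets its own independent jar:
 def start_requests(self):
     urls = ['http://temp.wuxingxiangsheng.com/test/request'] * 3
     for jar_id, url in enumerate(urls):
         # one isolated cookie session per jar_id
         yield Request(url, meta={'cookiejar': jar_id}, dont_filter=True)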
Example 11
    def start_requests(self):
        #meta = {"search_url" : "search_url"}
        ## books
        #yield Request("http://www.jstor.org/stable/10.7249/mg358cf.10?seq=1#page_scan_tab_contents", self.parse_issue, meta = meta, dont_filter = True)
        ## journals
        ##yield Request("http://www.jstor.org/stable/40279148?Search=yes&resultItemClick=true&searchText=agriculture&searchText=OR&searchText=agricultural&searchText=OR&searchText=rural&searchUri=%2Faction%2FdoBasicSearch%3Fgroup%3Dnone%26amp%3Bsd%3D2009%252F03%26amp%3BsearchType%3DfacetSearch%26amp%3BQuery%3Dagriculture%2BOR%2Bagricultural%2BOR%2Brural%26amp%3Bpage%3D6%26amp%3Bfc%3Doff%26amp%3Bed%3D2009%252F04%26amp%3Bacc%3Don%26amp%3Bwc%3Don&seq=1#page_scan_tab_contents", self.parse_issue, meta = meta, dont_filter = True)
        #return
        if self.url_file:
            # a url_file was supplied: crawl only the URLs listed in it
            with open(self.url_file) as f:
                for line in f:
                    json_data = json.loads(line)
                    if "url" in json_data:
                        url = json_data["url"]
                        if url in self.crawled_url:
                            print("filter url: %s" % url)
                        else:
                            meta = {"origin_url": json_data["url"]}
                            url = "https://www.jstor.org/stable/25097205?seq=1#page_scan_tab_contents"
                            yield Request(url,
                                          self.parse_issue,
                                          meta=meta,
                                          dont_filter=True)
                            return
                            yield Request(json_data["url"],
                                          self.parse_issue,
                                          meta=meta,
                                          dont_filter=True)
                        #return
            return

        i = 0
        final_page = 250  #11.25, total result is 25028
        #final_page = 1

        start_year = 2005
        end_year = 2017
        start_month = 1
        end_month = 12
        current_year = start_year
        while (current_year <= end_year):
            current_month = start_month
            while (current_month < end_month):
                start_date = str(current_year) + "%2F" + "%02d" % current_month
                end_date = str(
                    current_year) + "%2F" + "%02d" % (current_month + 1)
                start_url = "http://www.jstor.org/action/doBasicSearch?searchType=facetSearch&page=1&sd=%s&ed=%s&wc=on&acc=on&fc=off&Query=agriculture+OR+agricultural+OR+rural&group=none" % (
                    start_date, end_date)
                if start_url in self.crawled_url:
                    print("start_url crawled, filter: %s" % start_url)
                else:
                    meta = {
                        "page": 1,
                        "current_year": current_year,
                        "current_month": current_month
                    }
                    yield Request(start_url,
                                  self.parse_result_of_date,
                                  meta=meta,
                                  dont_filter=True)
                current_month = current_month + 1
            current_year = current_year + 1
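The month loop above hand-encodes the sd/ed date facets ('%2F' is an escaped '/'). A minimal sketch of the same URL construction with urllib.parse.urlencode, assuming the parameters visible in the literal URL are the complete set:
from urllib.parse import urlencode

def build_search_url(year, month):
    params = {
        "searchType": "facetSearch",
        "page": 1,
        "sd": "%d/%02d" % (year, month),
        "ed": "%d/%02d" % (year, month + 1),
        "wc": "on",
        "acc": "on",
        "fc": "off",
        "Query": "agriculture OR agricultural OR rural",
        "group": "none",
    }
    # urlencode escapes '/' as %2F and spaces as '+', matching the hand-built string
    return "http://www.jstor.org/action/doBasicSearch?" + urlencode(params)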
Example 12
 def start_requests(self):
     for url in self.start_urls:
         yield Request(url,
                       cookies={'xxx.com': 'true'},
                       callback=self.parse,
                       dont_filter=True)
Example 13
 def start_requests(self):
     yield Request("https://kyfw.12306.cn/otn/userCommon/allProvince", callback=self.parse, meta={"turn": self.turn})
Example 14
 def start_requests(self):
     return [
         Request(
             "http://api.rtvslo.si/ava/getShows?client_id=82013fb3a531d5414f478747c1aca622",
             callback=self.parse_oddaje)
     ]
Example 15
 def list_parse(self, response):
     urls = response.xpath('//div[@class="ContentDesc"]/a/@href').extract()
     for url in urls:
         yield Request(url, meta=response.meta, callback=self.detail_parse)
Example 16
    def next_request(self):

        self.logger.info(
            "length of queue %s is %s" %
            (self.queue_name, self.redis_conn.zcard(self.queue_name)))
        item = None
        if time.time() - self.request_interval < self.last_acs_time:
            return item
        if self.settings.getbool("CUSTOM_REDIS"):
            item = self.redis_conn.zpop(self.queue_name)
        else:
            pipe = self.redis_conn.pipeline()
            pipe.multi()
            pipe.zrange(self.queue_name, 0,
                        0).zremrangebyrank(self.queue_name, 0, 0)
            result, count = pipe.execute()

            if result:
                item = result[0]

        if item:
            self.last_acs_time = time.time()
            item = pickle.loads(item)
            self.present_item = item
            headers = item.get("headers", {})
            body = item.get("body")
            if item.get("method"):
                method = item.get("method")
            else:
                method = "GET"

            try:
                req = Request(item['url'],
                              method=method,
                              body=body,
                              headers=headers)
            except ValueError:
                req = Request('http://' + item['url'],
                              method=method,
                              body=body,
                              headers=headers)

            if 'callback' in item:
                cb = item['callback']
                if cb and self.spider:
                    cb = getattr(self.spider, cb)
                    req.callback = cb

            if 'errback' in item:
                eb = item['errback']
                if eb and self.spider:
                    eb = getattr(self.spider, eb)
                    req.errback = eb

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0

            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                req.meta[key] = item[key]

            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']

            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], (str, bytes)):
                    req.cookies = parse_cookie(item['cookie'])

            return req
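next_request above pops a pickled dict off a Redis sorted set, so the producer side presumably pushes dicts carrying the keys read here (url, method, headers, body, callback, meta). A minimal sketch of such a feeder, assuming redis-py 3.x (zadd(name, {member: score})) and that lower scores are served first:
import pickle
import time

def enqueue_request(redis_conn, queue_name, url, callback=None, meta=None):
    # shape matches what next_request() unpickles
    item = {
        "url": url,
        "method": "GET",
        "headers": {},
        "body": None,
        "callback": callback,  # spider method name, resolved via getattr
        "meta": meta or {},
    }
    redis_conn.zadd(queue_name, {pickle.dumps(item): time.time()})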
Example 17
 def parse(self, response):
     urls = response.xpath('//div[@class="tcat"]//a/@href').extract()
     for url in urls:
         if "10032191" not in url:
             continue
         yield Request(url, callback=self.get_all_list)
Example 18
def gen_request(url, callback, item=None):
    r = Request(url, callback=callback)
    if item:
        r.meta['item'] = item
    return r
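A short usage sketch for the helper, with the item read back from response.meta in the next callback (the XPaths, the 'detail.html' path, and the callback names are placeholders):
def parse(self, response):
    item = {'title': response.xpath('//h1/text()').get()}
    yield gen_request(response.urljoin('detail.html'), self.parse_detail, item)

def parse_detail(self, response):
    item = response.meta['item']
    item['body'] = response.xpath('//article//text()').getall()
    yield item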
Example 19
 def start_requests(self):
     open('./ips.json', 'w').close()
     request = Request(url="https://free-proxy-list.net/", callback=self.parse, headers=self.free_list_header)
     yield request
Example 20
    def parse(self, response):

        data = response.text[22:-2]
        data = json.loads(data)

        posts = data['posts']
        for post in posts:

            if post['type'] == 'video':
                video_player = post['video-player']
                try:
                    video_id = re.findall(r'/(tumblr_[^_]*)_[^\.]*?\.jpg',
                                          video_player)[0]
                    video_url = 'https://vtt.tumblr.com/{}_480.mp4'.format(
                        video_id)
                    video_name = video_url.split('/')[-1]
                    video_path = post['type'] + '/' + video_name

                    item = TumblrspiderItem()
                    item['file_url'] = video_url
                    item['file_path'] = video_path
                    item['file_type'] = post['type']
                    yield item

                except IndexError:
                    print(video_player)

            elif post['type'] == 'photo':
                photo_url = post['photo-url-1280']
                photo_name = photo_url.split('/')[-1]
                photo_path = post['type'] + '/' + photo_name

                item = TumblrspiderItem()
                item['file_url'] = photo_url
                item['file_path'] = photo_path
                item['file_type'] = post['type']
                yield item

            else:
                print(post['type'])

            try:
                reblogged_url = post['reblogged-from-url']
            except KeyError:
                continue
            try:
                user_name = re.findall(r'://([^\.]*)\.tumblr\.com',
                                       reblogged_url)[0]
            except IndexError:
                continue
            print(user_name)
            url = '''https://{}.tumblr.com/api/read/json?start=0&num=200'''.format(
                user_name)

            depth = response.meta['depth'] + 1

            if depth <= self.max_depth:

                yield Request(url,
                              headers=self.get_headers(),
                              callback=self.parse,
                              meta={'depth': depth})
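The parse callback above expects every response to carry a 'depth' counter in meta and custom headers from self.get_headers(). A minimal sketch of how the crawl might be seeded under those assumptions (the blog name is a placeholder):
    def start_requests(self):
        # Tumblr API v1 endpoint; parse() strips the leading "var tumblr_api_read = " wrapper (22 chars)
        url = 'https://example-blog.tumblr.com/api/read/json?start=0&num=200'
        yield Request(url,
                      headers=self.get_headers(),
                      callback=self.parse,
                      meta={'depth': 0})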
Example 21
 def get_media_requests(self, item, info):
     if isinstance(item, PDFItem):
         yield Request(url=item['file_urls'],
                       headers=DEFAULT_REQUEST_HEADERS,
                       meta={'file_names': item['file_names']})
Example 22
 def start_requests(self):
     for url in self.start_urls:
         yield Request(url, dont_filter=True, headers={'Host': 'jandan.net'}, callback=self.parse)
Example 23
     status=200,
     headers={
         "Content-Type": "application/json",
         "Content-Encoding": "gzip",
         "Transfer-Encoding": "chunked",
         "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
         "Proxy-Connection": "close",
         "Connection": "close",
     },
     request=Request(
         SETTINGS["CRAWLERA_FETCH_URL"],
         meta={
             "crawlera_fetch": {
                 "timing": {
                     "start_ts": mocked_time()
                 },
                 "original_request": {
                     "url": "https://fake.host.com"
                 },
             }
         },
     ),
     body=
     b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""",  # noqa: E501
 ),
 "expected":
 TextResponse(
     url="https://fake.host.com",
     status=123,
     headers={"Fake-Header": "true"},
     body=b"""foobar""",  # noqa: E501
Example 24
 def parse(self, response):
     categories = response.xpath('//ul[@class="product-categories"]/li/a')
     for cat in categories:
         cat_url = cat.xpath('./@href').get()
         parent_drug = cat.xpath('./text()').get()
         yield Request(cat_url, callback=self.list_parse, meta={"parent_drug": parent_drug})
Example 25
    def start_requests(self):
        dc = datetime.datetime.now()
        dc = dc.replace(minute=0, second=0, microsecond=0)
        self.start = dc
        dn = dc + datetime.timedelta(hours=1)

        self.d = defaultdict(int)
        c = 0
        q = self.api.getDataResources(self.snapshot,
                                      format=self.format,
                                      portalid=self.portalID)
        log.info("Querying for uris", start=dc, end=dn, query=str(q))
        schedules = [s for s in q]
        log.info("Received seed uris", count=len(schedules))
        #schedules=[Schedule(uri='http://umbrich.org/', experiment='test')]
        for s in schedules:  #],Schedule(uri='http://polleres.net/', experiment='test'),Schedule(uri='http://notavailable/', experiment='test')]:
            domain = ''
            try:
                parsed_uri = urlparse(s.uri)
                domain = '{uri.netloc}'.format(uri=parsed_uri)
            except Exception:
                domain = 'error'
            self.d[domain] += 1

            # set hard link to git location
            filename = None
            if self.git_location:
                d = self.api.getDatasetData(md5=s.md5)
                # try to get name
                if 'name' in d.raw:
                    dir_name = d.raw['name']
                else:
                    dir_name = s.id

                # try to get resource name
                filename = None
                for r in d.raw.get('resources', []):
                    if r.get('url', '') == s.uri:
                        filename = r.get('name')
                        if not filename:
                            filename = r.get('id')
                        break
                if not filename:
                    filename = s.uri.split('/')[-1]
                    if len(filename) < 4:
                        filename = s.uri[:-150]

                filename = utils.helper_functions.format_filename(filename)
                res_dir = os.path.join(self.git_location, self.portalID,
                                       dir_name, 'resources')
                if not os.path.exists(res_dir):
                    os.mkdir(res_dir)
                filename = os.path.join(res_dir, filename)

            yield Request(s.uri,
                          dont_filter=True,
                          meta={
                              'handle_httpstatus_all': True,
                              'domain': domain,
                              'referrer': None,
                              'snapshot': self.snapshot,
                              'git': filename,
                              'orig_url': s.uri
                          })
            self.crawler.stats.inc_value('seeds')
            c += 1

        self.crawler.stats.set_value('seedPLDs', len(self.d))
        self.crawler.stats.set_value('domains', dict(self.d))
        log.info("InitScheduled", uris=c)
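A minimal sketch of the callback side under the same assumptions (the stats key and the write-to-disk step are guesses): handle_httpstatus_all lets every status code reach the callback, and the 'git' meta value is the path where a successful body could be written.
    def parse(self, response):
        # every status code arrives here because handle_httpstatus_all is set
        self.crawler.stats.inc_value('status/%s' % response.status)
        git_path = response.meta.get('git')
        if git_path and response.status == 200:
            with open(git_path, 'wb') as f:
                f.write(response.body)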
Example 26
 def parse(self, response):
     ref_urls = response.xpath('//ol/li/div//a/@href').extract()
     for ref_url in ref_urls:
         url = self.base_url + ref_url
         yield Request(url, callback=self.list_parse)
Example 27
 def parse_question(self, response):
     questions_links = response.xpath("//*[contains(@href, 'question/index?qid')]/@href").extract()
     for link in questions_links:
         url = "https://br.answers.yahoo.com%s" % link
         yield Request(url, callback=self.extract_question)
Example 28
 def parse(self, response):
     tmp = "http://www.hi-chemical.com/?s={}&post_type=product"
     parents = response.xpath('//table[contains(@id,"table")]//tr[position()>1]/td/input/@value').extract()
     for parent in parents:
         yield Request(tmp.format(parent), meta={"parent": parent}, callback=self.list_parse)
Example 29
 def start_requests(self):
     yield Request(self.myurl, self.parse)
Example 30
 def start_requests(self):
     yield Request("http://www.12306.cn/mormhweb/kyyyz/", callback=self.parse, meta={"turn": self.turn})